Diffstat (limited to 'llvm')
-rw-r--r-- llvm/Maintainers.md | 2
-rw-r--r-- llvm/docs/AMDGPUUsage.rst | 59
-rw-r--r-- llvm/docs/CIBestPractices.rst | 28
-rw-r--r-- llvm/docs/CommandGuide/llvm-objdump.rst | 26
-rw-r--r-- llvm/docs/DirectX/RootSignatures.rst | 245
-rw-r--r-- llvm/docs/DirectXUsage.rst | 1
-rw-r--r-- llvm/docs/GettingStarted.rst | 42
-rw-r--r-- llvm/docs/LangRef.rst | 106
-rw-r--r-- llvm/docs/ProgrammersManual.rst | 38
-rw-r--r-- llvm/docs/ReleaseNotes.md | 198
-rw-r--r-- llvm/docs/TestingGuide.rst | 4
-rw-r--r-- llvm/docs/YamlIO.rst | 136
-rw-r--r-- llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst | 2
-rw-r--r-- llvm/include/llvm/ADT/ArrayRef.h | 4
-rw-r--r-- llvm/include/llvm/ADT/DenseMapInfo.h | 86
-rw-r--r-- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 10
-rw-r--r-- llvm/include/llvm/Analysis/TargetTransformInfo.h | 2
-rw-r--r-- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 4
-rw-r--r-- llvm/include/llvm/AsmParser/LLToken.h | 1
-rw-r--r-- llvm/include/llvm/BinaryFormat/ELF.h | 21
-rw-r--r-- llvm/include/llvm/BinaryFormat/SFrame.h | 28
-rw-r--r-- llvm/include/llvm/BinaryFormat/SFrameConstants.def | 39
-rw-r--r-- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 9
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h | 14
-rw-r--r-- llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h | 12
-rw-r--r-- llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h | 12
-rw-r--r-- llvm/include/llvm/CodeGen/MachineInstrBundle.h | 7
-rw-r--r-- llvm/include/llvm/CodeGen/MachineScheduler.h | 18
-rw-r--r-- llvm/include/llvm/CodeGen/Passes.h | 4
-rw-r--r-- llvm/include/llvm/CodeGen/SelectionDAG.h | 15
-rw-r--r-- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 16
-rw-r--r-- llvm/include/llvm/CodeGen/TargetLowering.h | 20
-rw-r--r-- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 5
-rw-r--r-- llvm/include/llvm/Config/abi-breaking.h.cmake | 33
-rw-r--r-- llvm/include/llvm/Demangle/Demangle.h | 59
-rw-r--r-- llvm/include/llvm/Demangle/DemangleConfig.h | 20
-rw-r--r-- llvm/include/llvm/Demangle/ItaniumDemangle.h | 3
-rw-r--r-- llvm/include/llvm/Demangle/MicrosoftDemangle.h | 7
-rw-r--r-- llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h | 60
-rw-r--r-- llvm/include/llvm/ExecutionEngine/MCJIT.h | 14
-rw-r--r-- llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h | 44
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 9
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h | 5
-rw-r--r-- llvm/include/llvm/IR/CallingConv.h | 8
-rw-r--r-- llvm/include/llvm/IR/DebugInfo.h | 29
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 54
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsNVVM.td | 65
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 4
-rw-r--r-- llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 83
-rw-r--r-- llvm/include/llvm/IR/PassInstrumentation.h | 2
-rw-r--r-- llvm/include/llvm/IR/PatternMatch.h | 40
-rw-r--r-- llvm/include/llvm/InitializePasses.h | 1
-rw-r--r-- llvm/include/llvm/LinkAllIR.h | 8
-rw-r--r-- llvm/include/llvm/LinkAllPasses.h | 15
-rw-r--r-- llvm/include/llvm/MC/DXContainerRootSignature.h | 5
-rw-r--r-- llvm/include/llvm/MC/MCAsmBackend.h | 28
-rw-r--r-- llvm/include/llvm/MC/MCObjectStreamer.h | 20
-rw-r--r-- llvm/include/llvm/MC/MCSection.h | 549
-rw-r--r-- llvm/include/llvm/MC/MCSectionCOFF.h | 1
-rw-r--r-- llvm/include/llvm/MC/MCSectionELF.h | 5
-rw-r--r-- llvm/include/llvm/MC/MCSectionGOFF.h | 2
-rw-r--r-- llvm/include/llvm/MC/MCStreamer.h | 15
-rw-r--r-- llvm/include/llvm/MC/MCTargetOptions.h | 3
-rw-r--r-- llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h | 2
-rw-r--r-- llvm/include/llvm/Object/ELFObjectFile.h | 1
-rw-r--r-- llvm/include/llvm/Object/SFrameParser.h | 48
-rw-r--r-- llvm/include/llvm/Passes/MachinePassRegistry.def | 1
-rw-r--r-- llvm/include/llvm/Support/AArch64AttributeParser.h | 11
-rw-r--r-- llvm/include/llvm/Support/AlwaysTrue.h | 25
-rw-r--r-- llvm/include/llvm/Support/CommandLine.h | 17
-rw-r--r-- llvm/include/llvm/Support/DebugLog.h | 68
-rw-r--r-- llvm/include/llvm/TargetParser/AArch64TargetParser.h | 4
-rw-r--r-- llvm/include/llvm/Transforms/Utils/Local.h | 1
-rw-r--r-- llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h | 1
-rw-r--r-- llvm/include/llvm/Transforms/Utils/ProfileVerify.h | 36
-rw-r--r-- llvm/lib/Analysis/ConstantFolding.cpp | 168
-rw-r--r-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 23
-rw-r--r-- llvm/lib/Analysis/ProfileSummaryInfo.cpp | 14
-rw-r--r-- llvm/lib/Analysis/ScalarEvolution.cpp | 21
-rw-r--r-- llvm/lib/Analysis/StackLifetime.cpp | 5
-rw-r--r-- llvm/lib/Analysis/TargetTransformInfo.cpp | 2
-rw-r--r-- llvm/lib/Analysis/ValueTracking.cpp | 18
-rw-r--r-- llvm/lib/AsmParser/LLLexer.cpp | 1
-rw-r--r-- llvm/lib/AsmParser/LLParser.cpp | 32
-rw-r--r-- llvm/lib/BinaryFormat/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/BinaryFormat/SFrame.cpp | 37
-rw-r--r-- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 40
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/ARMException.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp | 32
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h | 11
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 16
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/WinException.h | 1
-rw-r--r-- llvm/lib/CodeGen/CodeGen.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/CodeGenPrepare.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/ExpandFp.cpp | 33
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 66
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 20
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 220
-rw-r--r-- llvm/lib/CodeGen/MIRPrinter.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/MachineInstrBundle.cpp | 31
-rw-r--r-- llvm/lib/CodeGen/MachineLICM.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/MachinePipeliner.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/MachineScheduler.cpp | 24
-rw-r--r-- llvm/lib/CodeGen/SafeStack.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 43
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 50
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 26
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 47
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp | 5
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 33
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 15
-rw-r--r-- llvm/lib/CodeGen/StackProtector.cpp | 10
-rw-r--r-- llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 1
-rw-r--r-- llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp | 3
-rw-r--r-- llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp | 2
-rw-r--r-- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 57
-rw-r--r-- llvm/lib/FileCheck/FileCheck.cpp | 6
-rw-r--r-- llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp | 474
-rw-r--r-- llvm/lib/IR/AsmWriter.cpp | 8
-rw-r--r-- llvm/lib/IR/AutoUpgrade.cpp | 51
-rw-r--r-- llvm/lib/IR/DebugInfo.cpp | 87
-rw-r--r-- llvm/lib/IR/Function.cpp | 1
-rw-r--r-- llvm/lib/IR/PassInstrumentation.cpp | 6
-rw-r--r-- llvm/lib/IR/Type.cpp | 18
-rw-r--r-- llvm/lib/IR/Value.cpp | 55
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 63
-rw-r--r-- llvm/lib/MC/MCAsmStreamer.cpp | 21
-rw-r--r-- llvm/lib/MC/MCAssembler.cpp | 220
-rw-r--r-- llvm/lib/MC/MCCodeView.cpp | 17
-rw-r--r-- llvm/lib/MC/MCDwarf.cpp | 11
-rw-r--r-- llvm/lib/MC/MCELFStreamer.cpp | 2
-rw-r--r-- llvm/lib/MC/MCExpr.cpp | 9
-rw-r--r-- llvm/lib/MC/MCFragment.cpp | 24
-rw-r--r-- llvm/lib/MC/MCGOFFStreamer.cpp | 20
-rw-r--r-- llvm/lib/MC/MCMachOStreamer.cpp | 12
-rw-r--r-- llvm/lib/MC/MCObjectStreamer.cpp | 99
-rw-r--r-- llvm/lib/MC/MCParser/AsmParser.cpp | 16
-rw-r--r-- llvm/lib/MC/MCParser/MCTargetAsmParser.cpp | 7
-rw-r--r-- llvm/lib/MC/MCSection.cpp | 28
-rw-r--r-- llvm/lib/MC/MCSectionCOFF.cpp | 4
-rw-r--r-- llvm/lib/MC/MCSectionELF.cpp | 2
-rw-r--r-- llvm/lib/MC/MCStreamer.cpp | 8
-rw-r--r-- llvm/lib/MC/MCTargetOptions.cpp | 3
-rw-r--r-- llvm/lib/MC/MCTargetOptionsCommandFlags.cpp | 7
-rw-r--r-- llvm/lib/MC/MCWin64EH.cpp | 11
-rw-r--r-- llvm/lib/MC/MCWinCOFFStreamer.cpp | 40
-rw-r--r-- llvm/lib/MC/MCXCOFFStreamer.cpp | 2
-rw-r--r-- llvm/lib/MC/MachObjectWriter.cpp | 23
-rw-r--r-- llvm/lib/MC/WasmObjectWriter.cpp | 19
-rw-r--r-- llvm/lib/MC/WinCOFFObjectWriter.cpp | 31
-rw-r--r-- llvm/lib/ObjCopy/MachO/MachOObject.h | 4
-rw-r--r-- llvm/lib/ObjCopy/MachO/MachOWriter.cpp | 4
-rw-r--r-- llvm/lib/Object/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Object/ELFObjectFile.cpp | 17
-rw-r--r-- llvm/lib/Object/SFrameParser.cpp | 55
-rw-r--r-- llvm/lib/Passes/PassBuilder.cpp | 6
-rw-r--r-- llvm/lib/Passes/PassRegistry.def | 2
-rw-r--r-- llvm/lib/ProfileData/InstrProfReader.cpp | 2
-rw-r--r-- llvm/lib/Support/AArch64AttributeParser.cpp | 27
-rw-r--r-- llvm/lib/Support/CommandLine.cpp | 27
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 66
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 77
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 5
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 43
-rw-r--r-- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 26
-rw-r--r-- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 3
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 23
-rw-r--r-- llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 6
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Subtarget.h | 3
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2
-rw-r--r-- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 69
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 32
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 61
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 375
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 46
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 41
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 268
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 317
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 239
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 25
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 36
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 143
-rw-r--r-- llvm/lib/Target/AMDGPU/BUFInstructions.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/DSInstructions.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 47
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/FLATInstructions.td | 192
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 185
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 45
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h | 24
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 87
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 121
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrFormats.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 13
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 87
-rw-r--r-- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 36
-rw-r--r-- llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SISchedule.td | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/SMInstructions.td | 28
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 42
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 347
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 2
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 19
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 6
-rw-r--r-- llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 14
-rw-r--r-- llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 2
-rw-r--r-- llvm/lib/Target/BPF/BPF.h | 2
-rw-r--r-- llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp | 1
-rw-r--r-- llvm/lib/Target/DirectX/DXILDataScalarization.cpp | 46
-rw-r--r-- llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 9
-rw-r--r-- llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 97
-rw-r--r-- llvm/lib/Target/DirectX/DXILPrepare.cpp | 23
-rw-r--r-- llvm/lib/Target/DirectX/DXILResourceAccess.cpp | 3
-rw-r--r-- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 471
-rw-r--r-- llvm/lib/Target/DirectX/DXILRootSignature.h | 11
-rw-r--r-- llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 15
-rw-r--r-- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 36
-rw-r--r-- llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp | 57
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonInstrFormats.td | 1
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td | 21
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td | 5
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td | 414
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td | 642
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 1
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td | 179
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 226
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 14
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 17
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 117
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h | 9
-rw-r--r-- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp | 10
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 11
-rw-r--r-- llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp | 43
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 146
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 5
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 47
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 40
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/PowerPC/PPCSubtarget.h | 3
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 73
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 9
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 19
-rw-r--r-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVCallingConv.td | 14
-rw-r--r-- llvm/lib/Target/RISCV/RISCVFeatures.td | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 53
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 89
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 15
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 94
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.h | 9
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 1
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 105
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 461
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 52
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td | 15
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 42
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 14
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 10
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 231
-rw-r--r-- llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp | 96
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 200
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp | 12
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.h | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 8
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 28
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 44
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 9
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVAPI.cpp | 4
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 6
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 82
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 8
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 23
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.h | 2
-rw-r--r-- llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 1
-rw-r--r-- llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 3
-rw-r--r-- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp | 12
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 99
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 27
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp | 55
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h | 1
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 15
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 4
-rw-r--r-- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 28
-rw-r--r-- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 61
-rw-r--r-- llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 1
-rw-r--r-- llvm/lib/Target/X86/X86AsmPrinter.h | 1
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 24
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 8
-rw-r--r-- llvm/lib/Target/X86/X86MCInstLower.cpp | 208
-rw-r--r-- llvm/lib/TargetParser/AArch64TargetParser.cpp | 12
-rw-r--r-- llvm/lib/TargetParser/TargetParser.cpp | 1
-rw-r--r-- llvm/lib/TargetParser/Triple.cpp | 1
-rw-r--r-- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 159
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Coroutines/SpillUtils.cpp | 4
-rw-r--r-- llvm/lib/Transforms/HipStdPar/HipStdPar.cpp | 220
-rw-r--r-- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 12
-rw-r--r-- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 26
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 37
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 70
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 6
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 26
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 3
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 81
-rw-r--r-- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 35
-rw-r--r-- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 13
-rw-r--r-- llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp | 193
-rw-r--r-- llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemProfUse.cpp | 253
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 8
-rw-r--r-- llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h | 16
-rw-r--r-- llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 192
-rw-r--r-- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/NewGVN.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Scalar/Scalarizer.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 49
-rw-r--r-- llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Utils/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Utils/Debugify.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Utils/LCSSA.cpp | 19
-rw-r--r-- llvm/lib/Transforms/Utils/Local.cpp | 58
-rw-r--r-- llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/MemoryOpRemark.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Utils/ProfileVerify.cpp | 129
-rw-r--r-- llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 36
-rw-r--r-- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 10
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 135
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 19
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 96
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 2
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 107
-rw-r--r-- llvm/test/Analysis/BasicAA/modref.ll | 14
-rw-r--r-- llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll | 13
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll | 21
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll | 27
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll | 54
-rw-r--r-- llvm/test/Analysis/CostModel/ARM/arith.ll | 2405
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/cast-sat.ll | 608
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll | 71
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll | 223
-rw-r--r-- llvm/test/Analysis/CostModel/X86/free-intrinsics.ll | 15
-rw-r--r-- llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll | 15
-rw-r--r-- llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll | 15
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll | 39
-rw-r--r-- llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll | 13
-rw-r--r-- llvm/test/Analysis/MemorySSA/lifetime-simple.ll | 6
-rw-r--r-- llvm/test/Analysis/MemorySSA/pr39197.ll | 7
-rw-r--r-- llvm/test/Analysis/MemorySSA/pr43044.ll | 4
-rw-r--r-- llvm/test/Analysis/MemorySSA/pr43427.ll | 7
-rw-r--r-- llvm/test/Analysis/MemorySSA/pr43438.ll | 5
-rw-r--r-- llvm/test/Analysis/MemorySSA/renamephis.ll | 2
-rw-r--r-- llvm/test/Analysis/ScalarEvolution/add-expr-pointer-operand-sorting.ll | 31
-rw-r--r-- llvm/test/Analysis/ScalarEvolution/sdiv.ll | 4
-rw-r--r-- llvm/test/Analysis/ScalarEvolution/srem.ll | 4
-rw-r--r-- llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll | 77
-rw-r--r-- llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll | 9
-rw-r--r-- llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll | 9
-rw-r--r-- llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll | 57
-rw-r--r-- llvm/test/Assembler/difile-empty-source.ll | 12
-rw-r--r-- llvm/test/Bitcode/compatibility.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-gep-flags.ll | 8
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll | 10
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir | 109
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-mops.ll | 188
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 67
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll | 107
-rw-r--r-- llvm/test/CodeGen/AArch64/abds-neg.ll | 20
-rw-r--r-- llvm/test/CodeGen/AArch64/abds.ll | 15
-rw-r--r-- llvm/test/CodeGen/AArch64/abdu-neg.ll | 14
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll | 15
-rw-r--r-- llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir | 98
-rw-r--r-- llvm/test/CodeGen/AArch64/combine-sdiv.ll | 1
-rw-r--r-- llvm/test/CodeGen/AArch64/load-zext-bitcast.ll | 82
-rw-r--r-- llvm/test/CodeGen/AArch64/rem-by-const.ll | 1
-rw-r--r-- llvm/test/CodeGen/AArch64/stack-tagging.ll | 50
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll | 42
-rw-r--r-- llvm/test/CodeGen/AArch64/urem-lkk.ll | 68
-rw-r--r-- llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll | 100
-rw-r--r-- llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll | 20
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 20
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 24
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 20
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 92
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir | 19
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir | 30
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir | 30
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir | 30
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir | 29
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.sleep.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd-with-ret.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll | 18
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll | 95
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 522
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir | 3
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll | 145
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir | 353
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll | 59
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll | 59
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll | 59
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll | 59
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-trunc.mir | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir | 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/acc-ldst.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add-max.ll | 295
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add.i16.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add3.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add_i1.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add_i128.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add_i64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add_shl.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/add_u64.ll | 129
-rw-r--r-- llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/agpr-register-count.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/alignbit-pat.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amd.endpgm.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll | 15
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-cs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-es.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-gs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-hs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-ls.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-ps.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-vs.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/and-gcn.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/and.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/and_or.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/andorbitset.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/andorn2.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/anyext.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_load_add.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_load_local.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 18
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 52
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 18
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll | 18
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_store_local.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll | 20
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attr-unparseable.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll | 68
-rw-r--r-- llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/basic-branch.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/basic-call-return.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/basic-loop.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 147
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bf16-math.ll | 383
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bf16.ll | 14129
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bfi_int.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bfi_nested.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bfm.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/branch-uniformity.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bswap.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/buffer-schedule.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-c-function.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-constexpr.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-encoding.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/call-return-types.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/calling-conventions.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/captured-frame-index.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/carryout-selection.ll | 383
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/clamp.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/coalescer_remat.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/code-size-estimate.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/combine-and-sext-bool.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/commute-shifts.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/commute_modifiers.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/concat_vectors.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/convergence-tokens.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/copy_to_scc.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctlz.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctpop.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cttz.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cube.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dag-divergence.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dagcombine-select.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/debug-value.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/debug-value2.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/debug.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/default-fp-mode.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_i128.ll | 64
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dpp64_combine.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dpp_combine.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_gws_align.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_read2st64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ds_write2st64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll | 5
-rw-r--r-- llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/early-if-convert.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/elf.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/else.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/empty-function.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extload-align.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extload-private.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extload.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fabs.f64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fadd.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fadd64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 14
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fceil.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fceil64.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcmp64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fconst64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll | 298
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fdiv.f64.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fdot2.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fence-barrier.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ffloor.f64.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ffloor.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 18
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll | 6
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll818
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll201
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir43
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll86
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma-combine.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma.f16.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma.f64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmac.sdwa.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.ll205
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax_legacy.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll99
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin3.ll273
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin_legacy.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll99
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmul.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmul64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/fnearbyint.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines-gfx1200.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.si.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-fabs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-fmaak-bug.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-classify.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_sint.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_uint.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fpext-free.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/fpext.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fpext.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptosi.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoui.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/fract.f64.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fract.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/fshl.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/fshr.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsub.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsub.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsub64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-returns.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/gds-allocation.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/gds-atomic.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/gep-address-space.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-address.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-constant.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-directive.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-extload-i16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll131
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_smrd.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/half.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir33
-rw-r--r--llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/hoist-cond.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/icmp.i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/icmp64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/idiv-licm.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/image-schedule.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/img-nouse-adjust.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/imm.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/imm16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/immv216.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-call.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-private-64.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/infinite-loop.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir178
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir90
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-calls.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-constraints.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/inlineasm-16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_subreg.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ipra.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll103
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-dyn-vgpr-w32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll191
-rw-r--r--llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-args.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/known-never-nan.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/known-never-snan.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-bounds.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-output-queue.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-relocs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/literal64.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll67
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll166
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll201
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.d16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll552
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll105
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll67
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp10.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp2.ll55
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log.ll163
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log10.ll163
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log2.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.mulo.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll576
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.rint.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sin.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-f32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-f64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i1.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i32.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i8.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-hi16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-lo16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-f32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-f64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i1.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i8.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local.128.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local.96.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-select-ptr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir104
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomics.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomics64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-memory.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-idiom.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll370
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop_break.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-combine.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll634
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll189
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll540
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad.u16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_64_32.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_int24.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_uint24.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/madak.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/madmk.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/mai-hazards.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/mai-inline.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/max-sgprs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/max.i16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/max.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/max3.ll59
-rw-r--r--llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir46
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir60
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory_clause.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-store-crash.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-stores.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/mesa3d.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/mesa_regression.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-loop.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx942.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/min.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/min3.ll59
-rw-r--r--llvm/test/CodeGen/AMDGPU/minimummaximum.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/minmax.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/missing-store.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/movreld-bug.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr-non-ptr-intrinsics.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul.i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul.ll434
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_int24.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/multilevel-break.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/nand.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/nested-calls.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/nor.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/nsa-reassign.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/nullptr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-flat.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-global.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/omod.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/operand-folding.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/operand-spacing.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-compare.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/or.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/or3.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/pack.v2f16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/pack.v2i16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll1225
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/permute.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/permute_i8.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/preserve-hi16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/preserve-user-waitcnt.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/r600.bitcast.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/r600.sub.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/rcp_iflag.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/read_register.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/readcyclecounter.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/readsteadycounter.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/recursion.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/reduction.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/register-count-comments.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/reject-agpr-usage-before-gfx908.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rel32.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/rem_i128.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/resource-usage-pal.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ret.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/ret_jump.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/returnaddress.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotate-add.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotl.i64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotl.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotr.i64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotr.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_addk_i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_movk_i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sad.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/saddo.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/salu-to-valu.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/save-fp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll426
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-global.ll351
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll322
-rw-r--r--llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll372
-rw-r--r--llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched-setprio.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-avoid-spills.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-if-2.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-if.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-ilp.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-buffer.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-simple.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-i1.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-opt.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-vectors.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/select.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/select64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/selectcc.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/setcc-opt.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/setcc-sext.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/setcc.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/setcc64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/seto.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/setuo.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sext-eliminate.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sext-in-reg.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-copy.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgprcopies.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shift-i128.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shift-select.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl_add.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl_add_constant.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl_or.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-spill-cf.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-vector-hang.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sibling-call.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/sign_extend.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/sink-image-sample.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sint_to_fp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sitofp.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/skip-if-dead.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/smed3.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sminmax.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/smrd.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sopk-compares.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-m0.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-vgpr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/split-smrd.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sra.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/srl.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ssubo.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/stack-realign.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-barrier.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-global.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-hi16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-local.128.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-local.96.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-local.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-private.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-v3i64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.i16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub_i1.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub_u64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/swdev373493.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/switch-unreachable.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/target-cpu.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/trap-abis.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/trap.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-combine.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/udivrem.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/udivrem24.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/uint_to_fp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uitofp.f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/umed3.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-cfg.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-crash.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/unknown-processor.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/unpack-half.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/unsupported-calls.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/unsupported-cs-chain.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/v1024.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_cndmask.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_mac.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_mac_f16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_madak_f16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_pack.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_swap_b16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/valu-i1.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-alloca.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vectorize-loads.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/vop-shrink.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vopc_dpp.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vselect.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-xcnt.mir42
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/while-break.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir448
-rw-r--r--llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll2414
-rw-r--r--llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir902
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir1430
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wqm.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/write_register.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-reserved.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/xnor.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/xor3.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/xor_add.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/zero_extend.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll2
-rw-r--r--llvm/test/CodeGen/ARM/bad-constraint.ll25
-rw-r--r--llvm/test/CodeGen/ARM/inlineasm-int-to-float.ll17
-rw-r--r--llvm/test/CodeGen/ARM/stack-protector-eh-sjlj.ll164
-rw-r--r--llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll19
-rw-r--r--llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll19
-rw-r--r--llvm/test/CodeGen/AVR/jmp.ll3
-rw-r--r--llvm/test/CodeGen/AVR/llvm.sincos.ll883
-rw-r--r--llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-doubles.ll37
-rw-r--r--llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-int64.ll36
-rw-r--r--llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-low-precision.ll44
-rw-r--r--llvm/test/CodeGen/DirectX/UAddc.ll8
-rw-r--r--llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll80
-rw-r--r--llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll59
-rw-r--r--llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir50
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/build-vector.ll266
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/fpowi.ll96
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll11
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll8
-rw-r--r--llvm/test/CodeGen/LoongArch/llvm.exp10.ll13
-rw-r--r--llvm/test/CodeGen/LoongArch/llvm.sincos.ll30
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/build-vector.ll147
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/fpowi.ll49
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll8
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll15
-rw-r--r--llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll10
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll2
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll1
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll1
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll1
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir4
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll4
-rw-r--r--llvm/test/CodeGen/MSP430/llvm.exp10.ll198
-rw-r--r--llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll102
-rw-r--r--llvm/test/CodeGen/NVPTX/extractelement.ll71
-rw-r--r--llvm/test/CodeGen/NVPTX/i1-select.ll30
-rw-r--r--llvm/test/CodeGen/NVPTX/i128.ll582
-rw-r--r--llvm/test/CodeGen/NVPTX/i8x4-instructions.ll133
-rw-r--r--llvm/test/CodeGen/NVPTX/pr126337.ll2
-rw-r--r--llvm/test/CodeGen/NVPTX/tanhf.ll40
-rw-r--r--llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py14
-rw-r--r--llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py4
-rw-r--r--llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py4
-rw-r--r--llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py4
-rw-r--r--llvm/test/CodeGen/NVPTX/wmma.py129
-rw-r--r--llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll417
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll39
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir3
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll116
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll31
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll24
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll132
-rw-r--r--llvm/test/CodeGen/RISCV/abds-neg.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/abds.ll94
-rw-r--r--llvm/test/CodeGen/RISCV/addimm-mulimm.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/aext-to-sext.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/atomic-signext.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/attributes.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/bfloat-convert.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll40
-rw-r--r--llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll96
-rw-r--r--llvm/test/CodeGen/RISCV/div-by-constant.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/double-convert-strict.ll18
-rw-r--r--llvm/test/CodeGen/RISCV/double-convert.ll18
-rw-r--r--llvm/test/CodeGen/RISCV/float-convert-strict.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/float-convert.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/fpclamptosat.ll88
-rw-r--r--llvm/test/CodeGen/RISCV/half-convert-strict.ll42
-rw-r--r--llvm/test/CodeGen/RISCV/half-convert.ll60
-rw-r--r--llvm/test/CodeGen/RISCV/iabs.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/interrupt-attr.ll5616
-rw-r--r--llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll38
-rw-r--r--llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/machine-combiner.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/memcmp-optsize.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/memcmp.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/mul.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/neg-abs.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/overflow-intrinsics.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/pr145360.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/pr148084.ll279
-rw-r--r--llvm/test/CodeGen/RISCV/prefer-w-inst.mir4
-rw-r--r--llvm/test/CodeGen/RISCV/rotl-rotr.ll96
-rw-r--r--llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll36
-rw-r--r--llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rv64xtheadbb.ll48
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zba.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zbb.ll62
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zbkb.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll144
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll195
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll7
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll80
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll344
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir50
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll35
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll429
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll83
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll5741
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vl-opt-no-prop.ll10
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll53
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vl-opt.ll128
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vxrm.mir5
-rw-r--r--llvm/test/CodeGen/RISCV/select.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/sextw-removal.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/shifts.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/shl-cttz.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/srem-vector-lkk.ll52
-rw-r--r--llvm/test/CodeGen/RISCV/typepromotion-overflow.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/urem-lkk.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll44
-rw-r--r--llvm/test/CodeGen/RISCV/urem-vector-lkk.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll1901
-rw-r--r--llvm/test/CodeGen/RISCV/xandesbfhcvt.ll45
-rw-r--r--llvm/test/CodeGen/RISCV/xqciac.ll65
-rw-r--r--llvm/test/CodeGen/RISCV/xqcisls.ll47
-rw-r--r--llvm/test/CodeGen/RISCV/xtheadfmemidx.ll128
-rw-r--r--llvm/test/CodeGen/RISCV/xtheadmemidx.ll775
-rw-r--r--llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll6
-rw-r--r--llvm/test/CodeGen/SPARC/tls-sp.ll105
-rw-r--r--llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll66
-rw-r--r--llvm/test/CodeGen/SPIRV/pointers/resource-vector-load-store.ll39
-rw-r--r--llvm/test/CodeGen/SystemZ/pr60413.ll36
-rw-r--r--llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll3
-rw-r--r--llvm/test/CodeGen/WebAssembly/memcmp-expand.ll151
-rw-r--r--llvm/test/CodeGen/WebAssembly/memory-interleave.ll1413
-rw-r--r--llvm/test/CodeGen/WebAssembly/ref-test-func.ll146
-rw-r--r--llvm/test/CodeGen/WebAssembly/removed-terminator.ll26
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-conversions.ll28
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll10
-rw-r--r--llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll14
-rw-r--r--llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll34
-rw-r--r--llvm/test/CodeGen/X86/abds-neg.ll92
-rw-r--r--llvm/test/CodeGen/X86/avg.ll177
-rw-r--r--llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll2
-rw-r--r--llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll1
-rw-r--r--llvm/test/CodeGen/X86/conditional-tailcall.ll1
-rw-r--r--llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll39
-rw-r--r--llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll12
-rw-r--r--llvm/test/CodeGen/X86/freeze-vector.ll24
-rw-r--r--llvm/test/CodeGen/X86/noreturn-call-win64.ll12
-rw-r--r--llvm/test/CodeGen/X86/peephole-copy.mir8
-rw-r--r--llvm/test/CodeGen/X86/pr149841.ll34
-rw-r--r--llvm/test/CodeGen/X86/pr62286.ll8
-rw-r--r--llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll47
-rw-r--r--llvm/test/CodeGen/X86/seh-catch-all.ll2
-rw-r--r--llvm/test/CodeGen/X86/seh-catchpad.ll10
-rw-r--r--llvm/test/CodeGen/X86/seh-except-finally.ll6
-rw-r--r--llvm/test/CodeGen/X86/seh-finally.ll2
-rw-r--r--llvm/test/CodeGen/X86/seh-safe-div.ll5
-rw-r--r--llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll4
-rw-r--r--llvm/test/CodeGen/X86/select-optimize.ll6
-rw-r--r--llvm/test/CodeGen/X86/setcc-non-simple-type.ll4
-rw-r--r--llvm/test/CodeGen/X86/stack-coloring-wineh.ll2
-rw-r--r--llvm/test/CodeGen/X86/swap.ll9
-rw-r--r--llvm/test/CodeGen/X86/taildup-heapallocsite.ll2
-rw-r--r--llvm/test/CodeGen/X86/vec_extract.ll66
-rw-r--r--llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll8
-rw-r--r--llvm/test/CodeGen/X86/win-catchpad.ll8
-rw-r--r--llvm/test/CodeGen/X86/win-cleanuppad.ll2
-rw-r--r--llvm/test/CodeGen/X86/win32-eh-states.ll14
-rw-r--r--llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll2
-rw-r--r--llvm/test/CodeGen/X86/wineh-coreclr.ll14
-rw-r--r--llvm/test/CodeGen/XCore/exception.ll2
-rw-r--r--llvm/test/DebugInfo/Generic/mixed-source.ll58
-rw-r--r--llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s42
-rw-r--r--llvm/test/FileCheck/long-check.txt9
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll26
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll8
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/lifetime.ll174
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll12
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll46
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/alloca.ll73
-rw-r--r--llvm/test/MC/AMDGPU/gfx10_asm_vop3.s24
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_smem.s27
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s16
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s6
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s416
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s59
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s356
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s8
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s45
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s45
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s59
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s59
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s19
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s19
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s1483
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_alias.s5
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s65
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s76
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_err.s20
-rw-r--r--llvm/test/MC/AMDGPU/gfx7_err_pos.s13
-rw-r--r--llvm/test/MC/AMDGPU/gfx8_err_pos.s10
-rw-r--r--llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s24
-rw-r--r--llvm/test/MC/AVR/inst-brbc.s6
-rw-r--r--llvm/test/MC/AVR/inst-brbs.s6
-rw-r--r--llvm/test/MC/AVR/inst-brcc.s12
-rw-r--r--llvm/test/MC/AVR/inst-brcs.s12
-rw-r--r--llvm/test/MC/AVR/inst-breq.s11
-rw-r--r--llvm/test/MC/AVR/inst-brge.s9
-rw-r--r--llvm/test/MC/AVR/inst-brhc.s9
-rw-r--r--llvm/test/MC/AVR/inst-brhs.s9
-rw-r--r--llvm/test/MC/AVR/inst-brid.s9
-rw-r--r--llvm/test/MC/AVR/inst-brie.s9
-rw-r--r--llvm/test/MC/AVR/inst-brlo.s9
-rw-r--r--llvm/test/MC/AVR/inst-brlt.s9
-rw-r--r--llvm/test/MC/AVR/inst-brmi.s9
-rw-r--r--llvm/test/MC/AVR/inst-brne.s11
-rw-r--r--llvm/test/MC/AVR/inst-brpl.s9
-rw-r--r--llvm/test/MC/AVR/inst-brsh.s9
-rw-r--r--llvm/test/MC/AVR/inst-brtc.s9
-rw-r--r--llvm/test/MC/AVR/inst-brts.s9
-rw-r--r--llvm/test/MC/AVR/inst-brvc.s9
-rw-r--r--llvm/test/MC/AVR/inst-brvs.s9
-rw-r--r--llvm/test/MC/AVR/inst-rcall.s13
-rw-r--r--llvm/test/MC/AVR/inst-rjmp.s26
-rw-r--r--llvm/test/MC/COFF/bss-text.s12
-rw-r--r--llvm/test/MC/COFF/section.s2
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt24
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt12
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt321
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt258
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt46
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt45
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt15
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt1033
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt39
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt24
-rw-r--r--llvm/test/MC/ELF/AArch64/cfi.s4
-rw-r--r--llvm/test/MC/ELF/cfi.s4
-rw-r--r--llvm/test/MC/ELF/mc-dump.s8
-rw-r--r--llvm/test/MC/ELF/nobits-non-zero-value.s41
-rw-r--r--llvm/test/MC/ELF/section-sym-err.s7
-rw-r--r--llvm/test/MC/ELF/section-sym-err2.s6
-rw-r--r--llvm/test/MC/ELF/section-sym2.s39
-rw-r--r--llvm/test/MC/RISCV/Relocations/mc-dump.s14
-rw-r--r--llvm/test/MC/RISCV/attribute-arch.s2
-rw-r--r--llvm/test/MC/RISCV/rv32p-valid.s4
-rw-r--r--llvm/test/MC/RISCV/rv64p-valid.s6
-rw-r--r--llvm/test/MC/X86/intel-syntax-parentheses.s10
-rw-r--r--llvm/test/Other/new-pm-print-pipeline.ll2
-rw-r--r--llvm/test/TableGen/SDNodeInfoEmitter/advanced.td (renamed from llvm/test/TableGen/SDNodeInfoEmitter/basic.td)97
-rw-r--r--llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-1.td29
-rw-r--r--llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-2.td (renamed from llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints.td)37
-rw-r--r--llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td50
-rw-r--r--llvm/test/TableGen/SDNodeInfoEmitter/trivial-node.td34
-rw-r--r--llvm/test/ThinLTO/X86/memprof-basic.ll6
-rw-r--r--llvm/test/ThinLTO/X86/memprof-icp.ll1
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll106
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll901
-rw-r--r--llvm/test/Transforms/Attributor/heap_to_stack.ll20
-rw-r--r--llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll21
-rw-r--r--llvm/test/Transforms/Attributor/memory_locations.ll160
-rw-r--r--llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll4
-rw-r--r--llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll7
-rw-r--r--llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll6
-rw-r--r--llvm/test/Transforms/DCE/basic.ll42
-rw-r--r--llvm/test/Transforms/DeadStoreElimination/libcalls.ll13
-rw-r--r--llvm/test/Transforms/DeadStoreElimination/lifetime.ll12
-rw-r--r--llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll7
-rw-r--r--llvm/test/Transforms/EarlyCSE/memoryssa.ll25
-rw-r--r--llvm/test/Transforms/GVN/assume.ll2
-rw-r--r--llvm/test/Transforms/GVN/basic.ll2
-rw-r--r--llvm/test/Transforms/GVN/lifetime-simple.ll20
-rw-r--r--llvm/test/Transforms/GVN/nonescaping.ll2
-rw-r--r--llvm/test/Transforms/GVN/opt-remarks.ll3
-rw-r--r--llvm/test/Transforms/GVN/phi.ll2
-rw-r--r--llvm/test/Transforms/GVN/pr14166.ll2
-rw-r--r--llvm/test/Transforms/GVN/pre-compare.ll2
-rw-r--r--llvm/test/Transforms/GVN/readattrs.ll2
-rw-r--r--llvm/test/Transforms/GVN/setjmp.ll2
-rw-r--r--llvm/test/Transforms/GVN/tbaa.ll2
-rw-r--r--llvm/test/Transforms/GVN/vscale.ll2
-rw-r--r--llvm/test/Transforms/GVNSink/lifetime.ll77
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll15
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll15
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll15
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll14
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll13
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var-indirection.ll110
-rw-r--r--llvm/test/Transforms/HipStdPar/global-var.ll4
-rw-r--r--llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll5
-rw-r--r--llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll12
-rw-r--r--llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll15
-rw-r--r--llvm/test/Transforms/Inline/alloca-bonus.ll5
-rw-r--r--llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll449
-rw-r--r--llvm/test/Transforms/Inline/redundant-loads.ll3
-rw-r--r--llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll158
-rw-r--r--llvm/test/Transforms/InstCombine/abs-intrinsic.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/deadcode.ll5
-rw-r--r--llvm/test/Transforms/InstCombine/getelementptr.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-gep.ll276
-rw-r--r--llvm/test/Transforms/InstCombine/malloc-free.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/pr150338.ll16
-rw-r--r--llvm/test/Transforms/InstCombine/scalable-vector-struct.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/sub-gep.ll128
-rw-r--r--llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll646
-rw-r--r--llvm/test/Transforms/InstSimplify/exp10.ll9
-rw-r--r--llvm/test/Transforms/InstSimplify/fold-intrinsics.ll324
-rw-r--r--llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll2
-rw-r--r--llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll28
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll27
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll395
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll74
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll37
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll146
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll460
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll69
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll69
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll690
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll1879
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll96
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll16
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll29
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll445
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll70
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll80
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll18
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll22
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll18
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll307
-rw-r--r--llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll5
-rw-r--r--llvm/test/Transforms/Mem2Reg/ignore-droppable.ll4
-rw-r--r--llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll5
-rw-r--r--llvm/test/Transforms/MemCpyOpt/lifetime.ll19
-rw-r--r--llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll18
-rw-r--r--llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll23
-rw-r--r--llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll15
-rw-r--r--llvm/test/Transforms/MemProfContextDisambiguation/basic.ll6
-rw-r--r--llvm/test/Transforms/MoveAutoInit/clobber.ll22
-rw-r--r--llvm/test/Transforms/NewGVN/lifetime-simple.ll6
-rw-r--r--llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll45
-rw-r--r--llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll33
-rw-r--r--llvm/test/Transforms/NewGVN/verify-memoryphi.ll6
-rw-r--r--llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll8
-rw-r--r--llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll319
-rw-r--r--llvm/test/Transforms/PGOProfile/icp_mismatch_msg.ll6
-rw-r--r--llvm/test/Transforms/PGOProfile/indirect_call_promotion2.ll154
-rw-r--r--llvm/test/Transforms/PGOProfile/prof-verify-as-needed.ll20
-rw-r--r--llvm/test/Transforms/PGOProfile/prof-verify-existing.ll21
-rw-r--r--llvm/test/Transforms/PGOProfile/prof-verify.ll19
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll6
-rw-r--r--llvm/test/Transforms/SCCP/uscmp.ll185
-rw-r--r--llvm/test/Transforms/SROA/alloca-address-space.ll4
-rw-r--r--llvm/test/Transforms/SROA/basictest.ll3
-rw-r--r--llvm/test/Transforms/SROA/ignore-droppable.ll4
-rw-r--r--llvm/test/Transforms/SafeStack/X86/coloring2.ll37
-rw-r--r--llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll23
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll24
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll6
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll33
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll10
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/guards.ll34
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll91
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll26
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll52
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll115
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll193
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll100
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll74
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll49
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll76
-rw-r--r--llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll3
-rw-r--r--llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll4
-rw-r--r--llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll262
-rw-r--r--llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll165
-rw-r--r--llvm/test/Verifier/amdgpu-cc.ll33
-rw-r--r--llvm/test/Verifier/intrinsic-immarg.ll6
-rw-r--r--llvm/test/Verifier/opaque-ptr.ll6
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips64_eh.ll.expected6
-rw-r--r--llvm/test/tools/llvm-exegesis/RISCV/set-reg-init-check.s7
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s2898
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s2306
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s1762
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s226
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s1126
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s706
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s1770
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s530
-rw-r--r--llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s4
-rw-r--r--llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc10
-rw-r--r--llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s643
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/sframe-header.test148
-rw-r--r--llvm/tools/bugpoint/bugpoint.cpp3
-rw-r--r--llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp6
-rw-r--r--llvm/tools/llvm-objdump/ObjdumpOpts.td16
-rw-r--r--llvm/tools/llvm-objdump/SourcePrinter.cpp238
-rw-r--r--llvm/tools/llvm-objdump/SourcePrinter.h102
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp144
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.h7
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp129
-rw-r--r--llvm/tools/llvm-readobj/ObjDumper.cpp6
-rw-r--r--llvm/tools/llvm-readobj/ObjDumper.h5
-rw-r--r--llvm/tools/llvm-readobj/Opts.td2
-rw-r--r--llvm/tools/llvm-readobj/llvm-readobj.cpp4
-rw-r--r--llvm/unittests/Analysis/ValueTrackingTest.cpp28
-rw-r--r--llvm/unittests/Frontend/OpenMPDecompositionTest.cpp16
-rw-r--r--llvm/unittests/IR/DebugInfoTest.cpp13
-rw-r--r--llvm/unittests/Support/CMakeLists.txt1
-rw-r--r--llvm/unittests/Support/DebugLogTest.cpp77
-rw-r--r--llvm/unittests/Transforms/Utils/LocalTest.cpp23
-rw-r--r--llvm/utils/TableGen/CompressInstEmitter.cpp68
-rw-r--r--llvm/utils/UpdateTestChecks/asm.py1
-rw-r--r--llvm/utils/gn/build/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/libcxx/src/BUILD.gn5
-rw-r--r--llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn5
-rw-r--r--llvm/utils/gn/secondary/lldb/test/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Object/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn1
-rwxr-xr-xllvm/utils/update_mir_regclass_numbers27
-rwxr-xr-xllvm/utils/update_mir_test_checks.py7
2461 files changed, 73420 insertions, 44313 deletions
diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md
index 87d2a9a..ec6228d 100644
--- a/llvm/Maintainers.md
+++ b/llvm/Maintainers.md
@@ -532,7 +532,7 @@ Kostya Serebryany ([kcc](https://github.com/kcc)) -- Sanitizers \
Michael Spencer (bigcheesegs@gmail.com), [Bigcheese](https://github.com/Bigcheese)) -- Windows support in object tools \
Alexei Starovoitov (alexei.starovoitov@gmail.com, [4ast](https://github.com/4ast)) -- BPF backend \
Evgeniy Stepanov ([eugenis](https://github.com/eugenis)) -- Sanitizers \
-Zheng Chen (czhengsz@cn.ibm.com, [chenzheng1030](https://github.com/chenzheng1030)) -- PowerPC backend
+Zheng Chen (czhengsz@cn.ibm.com, [chenzheng1030](https://github.com/chenzheng1030)) -- PowerPC backend \
Dan Gohman (llvm@sunfishcode.online, [sunfishcode](https://github.com/sunfishcode)) -- WebAssembly backend
### Former maintainers of removed components
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c5b9bd9..d13f95b 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -677,7 +677,7 @@ the device used to execute the code match the features enabled when
generating the code. A mismatch of features may result in incorrect
execution, or a reduction in performance.
-The target features supported by each processor is listed in
+The target features supported by each processor are listed in
:ref:`amdgpu-processors`.
Target features are controlled by exactly one of the following Clang
@@ -783,7 +783,7 @@ description. The AMDGPU target specific information is:
Is an AMDGPU processor or alternative processor name specified in
:ref:`amdgpu-processor-table`. The non-canonical form target ID allows both
the primary processor and alternative processor names. The canonical form
- target ID only allow the primary processor name.
+ target ID only allows the primary processor name.
**target-feature**
Is a target feature name specified in :ref:`amdgpu-target-features-table` that
@@ -793,7 +793,7 @@ description. The AMDGPU target specific information is:
``--offload-arch``. Each target feature must appear at most once in a target
ID. The non-canonical form target ID allows the target features to be
specified in any order. The canonical form target ID requires the target
- features to be specified in alphabetic order.
+ features to be specified in alphabetical order.
.. _amdgpu-target-id-v2-v3:
@@ -886,7 +886,7 @@ supported for the ``amdgcn`` target.
setup (see :ref:`amdgpu-amdhsa-kernel-prolog-m0`).
To convert between a private or group address space address (termed a segment
- address) and a flat address the base address of the corresponding aperture
+ address) and a flat address, the base address of the corresponding aperture
can be used. For GFX7-GFX8 these are available in the
:ref:`amdgpu-amdhsa-hsa-aql-queue` the address of which can be obtained with
Queue Ptr SGPR (see :ref:`amdgpu-amdhsa-initial-kernel-execution-state`). For
@@ -1186,7 +1186,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
:ref:`llvm.stackrestore.p5 <int_stackrestore>` Implemented, must use the alloca address space.
:ref:`llvm.get.fpmode.i32 <int_get_fpmode>` The natural floating-point mode type is i32. This
- implemented by extracting relevant bits out of the MODE
+ is implemented by extracting relevant bits out of the MODE
register with s_getreg_b32. The first 10 bits are the
core floating-point mode. Bits 12:18 are the exception
mask. On gfx9+, bit 23 is FP16_OVFL. Bitfields not
@@ -1266,14 +1266,14 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs arbitrary gather-style
operation within a row (16 contiguous lanes) of the second input operand.
- The third and fourth inputs must be scalar values. these are combined into
+ The third and fourth inputs must be scalar values. These are combined into
a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
<2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.
llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style
operation across two rows of the second input operand (each row is 16 contiguous
- lanes). The third and fourth inputs must be scalar values. these are combined
+ lanes). The third and fourth inputs must be scalar values. These are combined
into a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
<2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.
@@ -1285,31 +1285,31 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
32-bit vectors.
llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
- support such instructions. This performs unsigned dot product
+ support such instructions. This performs an unsigned dot product
with two v2i16 operands, summed with the third i32 operand. The
i1 fourth operand is used to clamp the output.
llvm.amdgcn.udot4 Provides direct access to v_dot4_u32_u8 across targets which
- support such instructions. This performs unsigned dot product
+ support such instructions. This performs an unsigned dot product
with two i32 operands (holding a vector of 4 8bit values), summed
with the third i32 operand. The i1 fourth operand is used to clamp
the output.
llvm.amdgcn.udot8 Provides direct access to v_dot8_u32_u4 across targets which
- support such instructions. This performs unsigned dot product
+ support such instructions. This performs an unsigned dot product
with two i32 operands (holding a vector of 8 4bit values), summed
with the third i32 operand. The i1 fourth operand is used to clamp
the output.
llvm.amdgcn.sdot2 Provides direct access to v_dot2_i32_i16 across targets which
- support such instructions. This performs signed dot product
+ support such instructions. This performs a signed dot product
with two v2i16 operands, summed with the third i32 operand. The
i1 fourth operand is used to clamp the output.
When applicable (e.g. no clamping), this is lowered into
v_dot2c_i32_i16 for targets which support it.
llvm.amdgcn.sdot4 Provides direct access to v_dot4_i32_i8 across targets which
- support such instructions. This performs signed dot product
+ support such instructions. This performs a signed dot product
with two i32 operands (holding a vector of 4 8bit values), summed
with the third i32 operand. The i1 fourth operand is used to clamp
the output.
@@ -1321,7 +1321,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
of this instruction for gfx11 targets.
llvm.amdgcn.sdot8 Provides direct access to v_dot8_u32_u4 across targets which
- support such instructions. This performs signed dot product
+ support such instructions. This performs a signed dot product
with two i32 operands (holding a vector of 8 4bit values), summed
with the third i32 operand. The i1 fourth operand is used to clamp
the output.
@@ -1401,7 +1401,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
llvm.amdgcn.atomic.cond.sub.u32 Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
and ds_cond_sub_u32 based on address space on gfx12 targets. This
- performs subtraction only if the memory value is greater than or
+ performs a subtraction only if the memory value is greater than or
equal to the data value.
llvm.amdgcn.s.barrier.signal.isfirst Provides access to the s_barrier_signal_first instruction;
@@ -1646,7 +1646,7 @@ The AMDGPU backend supports the following LLVM IR attributes.
llvm.amdgcn.queue.ptr intrinsic. Note that unlike the other ABI hint
attributes, the queue pointer may be required in situations where the
intrinsic call does not directly appear in the program. Some subtargets
- require the queue pointer for to handle some addrspacecasts, as well
+ require the queue pointer to handle some addrspacecasts, as well
as the llvm.amdgcn.is.shared, llvm.amdgcn.is.private, llvm.trap, and
llvm.debug intrinsics.
@@ -1844,6 +1844,20 @@ The AMDGPU backend supports the following calling conventions:
..TODO::
Describe.
+ ``amdgpu_gfx_whole_wave`` Used for AMD graphics targets. Functions with this calling convention
+ cannot be used as entry points. They must have an i1 as the first argument,
+ which will be mapped to the value of EXEC on entry into the function. Other
+ arguments will contain poison in their inactive lanes. Similarly, the return
+ value for the inactive lanes is poison.
+
+ The function will run with all lanes enabled, i.e. EXEC will be set to -1 in the
+ prologue and restored to its original value in the epilogue. The inactive lanes
+ will be preserved for all the registers used by the function. Active lanes
+ will only be preserved for the callee-saved registers.
+
+ In all other respects, functions with this calling convention behave like
+ ``amdgpu_gfx`` functions.
+
``amdgpu_gs`` Used for Mesa/AMDPAL geometry shaders.
..TODO::
Describe.
@@ -1933,7 +1947,7 @@ The following describes all emitted function resource usage symbols:
callees, contains an indirect call
===================================== ========= ========================================= ===============================================================================
-Futhermore, three symbols are additionally emitted describing the compilation
+Furthermore, three symbols are emitted describing the compilation
unit's worst case (i.e, maxima) ``num_vgpr``, ``num_agpr``, and
``numbered_sgpr`` which may be referenced and used by the aforementioned
symbolic expressions. These three symbols are ``amdgcn.max_num_vgpr``,
@@ -6344,10 +6358,13 @@ also have to wait on all global memory operations, which is unnecessary.
:doc:`Memory Model Relaxation Annotations <MemoryModelRelaxationAnnotations>` can
be used as an optimization hint for fences to solve this problem.
-The AMDGPU backend recognizes the following tags on fences:
+The AMDGPU backend recognizes the following tags on fences to control which
+address spaces a fence synchronizes:
+
+- ``amdgpu-synchronize-as:local`` - for the local address space
+- ``amdgpu-synchronize-as:global`` - for the global address space
-- ``amdgpu-as:local`` - fence only the local address space
-- ``amdgpu-as:global``- fence only the global address space
+Multiple tags can be used at the same time to synchronize with more than one address space.
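+
+For example, a fence that only needs to synchronize the local address space
+can carry the tag as MMRA metadata. This is an illustrative sketch; the
+metadata node number is arbitrary:
+
+.. code-block:: llvm
+
+  fence syncscope("workgroup") release, !mmra !0
+
+  !0 = !{!"amdgpu-synchronize-as", !"local"}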
.. note::
@@ -17934,7 +17951,7 @@ set architecture (ISA) version of the assembly program.
"AMD" and *arch* should always be equal to "AMDGPU".
By default, the assembler will derive the ISA version, *vendor*, and *arch*
-from the value of the -mcpu option that is passed to the assembler.
+from the value of the ``-mcpu`` option that is passed to the assembler.
.. _amdgpu-amdhsa-assembler-directive-amdgpu_hsa_kernel:
@@ -17958,7 +17975,7 @@ default value for all keys is 0, with the following exceptions:
- *amd_kernel_code_version_minor* defaults to 2.
- *amd_machine_kind* defaults to 1.
- *amd_machine_version_major*, *machine_version_minor*, and
- *amd_machine_version_stepping* are derived from the value of the -mcpu option
+ *amd_machine_version_stepping* are derived from the value of the ``-mcpu`` option
that is passed to the assembler.
- *kernel_code_entry_byte_offset* defaults to 256.
- *wavefront_size* defaults 6 for all targets before GFX10. For GFX10 onwards
diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst
index 71fdd12..8301b95 100644
--- a/llvm/docs/CIBestPractices.rst
+++ b/llvm/docs/CIBestPractices.rst
@@ -108,3 +108,31 @@ If specific jobs within the workflow need additional permissions, those
permissions should be added within the specific job. This practice locks down
all permissions by default and only enables them when needed, better enforcing
the principle of least privilege.
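+
+For example, a job that needs to create a release can scope write access to
+itself while the workflow-level default stays read-only (an illustrative
+sketch; the job name and permission are placeholders):
+
+.. code-block:: yaml
+
+  jobs:
+    release:
+      permissions:
+        contents: write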
+
+Ensuring Workflows Run on the Correct Events
+--------------------------------------------
+
+GitHub allows workflows to run on a multitude of events, and it is important to
+configure a workflow so that it triggers on the correct events. There are
+two main best practices around events that trigger workflows:
+
+1. Workflows that are designed to run on pull requests should not be
+restricted by target branch. Unnecessarily restricting the target branch
+will prevent any stacked PRs from being tested. ``pull_request`` events
+should not contain a ``branches`` key (see the example at the end of this
+section).
+
+2. Workflows that are designed to also trigger on push events (e.g., for
+testing on ``main`` or one of the release branches) need to be restricted by
+branch. While pushes to a fork will not trigger a workflow run due to the
+``push`` event if the workflow already has its jobs disabled in forks
+(described above), stacked PRs will end up running jobs twice if the ``push``
+event does not have any branch restrictions. ``push`` events should have
+their branches restricted at the very least to ``main`` and the release
+branches as follows:
+
+.. code-block:: yaml
+
+ push:
+ branches:
+ - main
+ - releases/*
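+
+Conversely, per the first practice above, a ``pull_request`` trigger should be
+left unrestricted by branch (an illustrative sketch):
+
+.. code-block:: yaml
+
+  pull_request: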
diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst
index c9f0379..aaf38f8 100644
--- a/llvm/docs/CommandGuide/llvm-objdump.rst
+++ b/llvm/docs/CommandGuide/llvm-objdump.rst
@@ -140,23 +140,29 @@ OPTIONS
debug information for stripped binaries. Multiple instances of this argument
are searched in the order given.
-.. option:: --debuginfod, --no-debuginfod
+.. option:: --debug-indent=<width>
- Whether or not to try debuginfod lookups for debug binaries. Unless specified,
- debuginfod is only enabled if libcurl was compiled in (``LLVM_ENABLE_CURL``)
- and at least one server URL was provided by the environment variable
- ``DEBUGINFOD_URLS``.
+ Distance to indent the source-level variable or inlined function display,
+ relative to the start of the disassembly. Defaults to 52 characters.
+
+.. option:: --debug-inlined-funcs[=<format>]
-.. option:: --debug-vars=<format>
+ Print the locations of inlined functions alongside disassembly.
+ ``format`` may be ``ascii``, ``limits-only``, or ``unicode``, defaulting to
+ ``unicode`` if omitted.
+
+.. option:: --debug-vars[=<format>]
Print the locations (in registers or memory) of source-level variables
- alongside disassembly. ``format`` may be ``unicode`` or ``ascii``, defaulting
+ alongside disassembly. ``format`` may be ``ascii`` or ``unicode``, defaulting
to ``unicode`` if omitted.
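+
+ For example, to disassemble a binary (here the placeholder ``a.out``) and
+ print both variable locations and inlined functions using plain ASCII art:
+
+ .. code-block:: sh
+
+   llvm-objdump -d --debug-vars=ascii --debug-inlined-funcs=ascii a.out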
-.. option:: --debug-vars-indent=<width>
+.. option:: --debuginfod, --no-debuginfod
- Distance to indent the source-level variable display, relative to the start
- of the disassembly. Defaults to 52 characters.
+ Whether or not to try debuginfod lookups for debug binaries. Unless specified,
+ debuginfod is only enabled if libcurl was compiled in (``LLVM_ENABLE_CURL``)
+ and at least one server URL was provided by the environment variable
+ ``DEBUGINFOD_URLS``.
.. option:: -j, --section=<section1[,section2,...]>
diff --git a/llvm/docs/DirectX/RootSignatures.rst b/llvm/docs/DirectX/RootSignatures.rst
new file mode 100644
index 0000000..e328b4a
--- /dev/null
+++ b/llvm/docs/DirectX/RootSignatures.rst
@@ -0,0 +1,245 @@
+===============
+Root Signatures
+===============
+
+.. contents::
+ :local:
+
+.. toctree::
+ :hidden:
+
+Overview
+========
+
+A root signature is used to describe what resources a shader needs access to
+and how they're organized and bound in the pipeline. The DirectX Container
+(DXContainer) contains a root signature part (RTS0), which stores this
+information in a binary format. To assist with the construction of, and
+interaction with, a root signature, it is represented as metadata
+(``dx.rootsignatures``) in the LLVM IR. The metadata can then be converted to
+its binary form, as defined in
+`llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
+<https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h>`_.
+This document serves as a reference for users interfacing with the metadata
+representation of a root signature.
+
+Metadata Representation
+=======================
+
+Consider the following reference root signature; the subsequent sections
+describe its metadata representation and the corresponding operands.
+
+.. code-block:: HLSL
+
+ RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT),
+ RootConstants(b0, space = 1, num32BitConstants = 3),
+ CBV(b1, flags = 0),
+ StaticSampler(
+ filter = FILTER_MIN_MAG_POINT_MIP_LINEAR,
+ addressU = TEXTURE_ADDRESS_BORDER,
+ ),
+ DescriptorTable(
+ visibility = VISIBILITY_ALL,
+ SRV(t0, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE),
+ UAV(
+ numDescriptors = 5, u1, space = 10, offset = 5,
+ flags = DATA_VOLATILE
+ )
+ )
+
+.. note::
+
+ A root signature does not necessarily have a unique metadata representation.
+ Further, a malformed root signature can be represented in the metadata format
+ (e.g., mixing Sampler and non-Sampler descriptor ranges), and so it is the
+ user's responsibility to verify that it is a well-formed root signature.
+
+Named Root Signature Table
+==========================
+
+.. code-block:: LLVM
+
+ !dx.rootsignatures = !{!1}
+
+A named metadata node, ``dx.rootsignatures``, is used to identify the root
+signature table. The table itself is a list of references to function/root
+signature pairs.
+
+Function/Root Signature Pair
+============================
+
+.. code-block:: LLVM
+
+ !1 = !{ptr @main, !2, i32 2 }
+
+The function/root signature pair associates a function (the first operand) with
+a reference to a root signature (the second operand). The third operand is the
+root signature version, which is used for validation logic and the binary
+format.
+
+Root Signature
+==============
+
+.. code-block:: LLVM
+
+ !2 = !{ !3, !4, !5, !6, !7 }
+
+The root signature itself simply consists of a list of references to its root
+signature elements.
+
+Root Signature Element
+======================
+
+A root signature element is identified by the first operand, which is a string.
+The following root signature elements are defined:
+
+================= ======================
+Identifier String Root Signature Element
+================= ======================
+"RootFlags" Root Flags
+"RootConstants" Root Constants
+"RootCBV" Root Descriptor
+"RootSRV" Root Descriptor
+"RootUAV" Root Descriptor
+"StaticSampler" Static Sampler
+"DescriptorTable" Descriptor Table
+================= ======================
+
+The representation of each type of root signature element is listed below.
+
+Root Flags
+==========
+
+.. code-block:: LLVM
+
+ !3 = { !"RootFlags", i32 1 }
+
+======================= ====
+Description Type
+======================= ====
+`Root Signature Flags`_ i32
+======================= ====
+
+.. _Root Signature Flags: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_root_signature_flags
+
+Root Constants
+==============
+
+.. code-block:: LLVM
+
+ !4 = { !"RootConstants", i32 0, i32 1, i32 2, i32 3 }
+
+==================== ====
+Description Type
+==================== ====
+`Shader Visibility`_ i32
+Shader Register i32
+Register Space i32
+Number of 32-bit Values i32
+==================== ====
+
+.. _Shader Visibility: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_shader_visibility
+
+Root Descriptor
+===============
+
+As noted in the table above, the first operand will denote the type of
+root descriptor.
+
+.. code-block:: LLVM
+
+ !5 = { !"RootCBV", i32 0, i32 1, i32 0, i32 0 }
+
+======================== ====
+Description Type
+======================== ====
+`Shader Visibility`_ i32
+Shader Register i32
+Register Space i32
+`Root Descriptor Flags`_ i32
+======================== ====
+
+.. _Root Descriptor Flags: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_root_descriptor_flags
+
+Static Sampler
+==============
+
+.. code-block:: LLVM
+
+ !6 = !{ !"StaticSampler", i32 1, i32 4, ... }; remaining operands omitted for space
+
+==================== =====
+Description Type
+==================== =====
+`Filter`_ i32
+`AddressU`_ i32
+`AddressV`_ i32
+`AddressW`_ i32
+MipLODBias float
+MaxAnisotropy i32
+`ComparisonFunc`_ i32
+`BorderColor`_ i32
+MinLOD float
+MaxLOD float
+ShaderRegister i32
+RegisterSpace i32
+`Shader Visibility`_ i32
+==================== =====
+
+.. _Filter: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_filter
+.. _AddressU: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_texture_address_mode
+.. _AddressV: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_texture_address_mode
+.. _AddressW: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_texture_address_mode
+.. _ComparisonFunc: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_comparison_func
+.. _BorderColor: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_static_border_color
+
+Descriptor Table
+================
+
+A descriptor table consists of a shader visibility; the remaining operands are a
+list of references to its descriptor ranges.
+
+.. note::
+
+ The term Descriptor Table Clause is synonymous with Descriptor Range when
+ referencing the implementation details.
+
+.. code-block:: LLVM
+
+ !7 = !{ !"DescriptorTable", i32 0, !8, !9 }
+
+========================= ================
+Description Type
+========================= ================
+`Shader Visibility`_ i32
+Descriptor Range Elements Descriptor Range
+========================= ================
+
+
+Descriptor Range
+================
+
+As with a root descriptor, the first operand denotes the type of the
+descriptor range. It is one of the following types:
+
+- "CBV"
+- "SRV"
+- "UAV"
+- "Sampler"
+
+.. code-block:: LLVM
+
+ !8 = !{ !"SRV", i32 1, i32 0, i32 0, i32 -1, i32 4 }
+ !9 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 2 }
+
+============================== ====
+Description Type
+============================== ====
+Number of Descriptors in Range i32
+Shader Register i32
+Register Space i32
+`Offset`_ i32
+`Descriptor Range Flags`_ i32
+============================== ====
+
+.. _Offset: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_descriptor_range
+.. _Descriptor Range Flags: https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_descriptor_range_flags
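+
+Complete Example
+================
+
+The following is an illustrative (non-normative) sketch that assembles the
+element examples above into a single module-level encoding; the node numbering
+is arbitrary:
+
+.. code-block:: LLVM
+
+ !dx.rootsignatures = !{!0}
+ !0 = !{ptr @main, !1, i32 2}
+ !1 = !{!2, !3}
+ !2 = !{!"RootFlags", i32 1}
+ !3 = !{!"DescriptorTable", i32 0, !4}
+ !4 = !{!"SRV", i32 1, i32 0, i32 0, i32 -1, i32 4}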
diff --git a/llvm/docs/DirectXUsage.rst b/llvm/docs/DirectXUsage.rst
index 4d8f49b..1d964e6 100644
--- a/llvm/docs/DirectXUsage.rst
+++ b/llvm/docs/DirectXUsage.rst
@@ -17,6 +17,7 @@ User Guide for the DirectX Target
DirectX/DXILArchitecture
DirectX/DXILOpTableGenDesign
DirectX/DXILResources
+ DirectX/RootSignatures
Introduction
============
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 372fd40..3036dae 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -12,7 +12,7 @@ Welcome to the LLVM project!
The LLVM project has multiple components. The core of the project is
itself called "LLVM". This contains all of the tools, libraries, and header
-files needed to process intermediate representations and converts it into
+files needed to process intermediate representations and convert them into
object files. Tools include an assembler, disassembler, bitcode analyzer, and
bitcode optimizer. It also contains basic regression tests.
@@ -32,11 +32,11 @@ Getting the Source Code and Building LLVM
#. Check out LLVM (including subprojects like Clang):
* ``git clone https://github.com/llvm/llvm-project.git``
- * Or, on windows:
+ * Or, on Windows:
``git clone --config core.autocrlf=false
https://github.com/llvm/llvm-project.git``
- * To save storage and speed-up the checkout time, you may want to do a
+ * To save storage and speed up the checkout time, you may want to do a
`shallow clone <https://git-scm.com/docs/git-clone#Documentation/git-clone.txt---depthltdepthgt>`_.
For example, to get the latest revision of the LLVM project, use
@@ -71,7 +71,7 @@ Getting the Source Code and Building LLVM
Some common options:
- * ``-DLLVM_ENABLE_PROJECTS='...'`` --- semicolon-separated list of the LLVM
+ * ``-DLLVM_ENABLE_PROJECTS='...'`` --- A semicolon-separated list of the LLVM
subprojects you'd like to additionally build. Can include any of: clang,
clang-tools-extra, lldb, lld, polly, or cross-project-tests.
@@ -82,10 +82,10 @@ Getting the Source Code and Building LLVM
pathname of where you want the LLVM tools and libraries to be installed
(default ``/usr/local``).
- * ``-DCMAKE_BUILD_TYPE=type`` --- Controls optimization level and debug
+ * ``-DCMAKE_BUILD_TYPE=type`` --- Controls the optimization level and debug
information of the build. Valid options for *type* are ``Debug``,
``Release``, ``RelWithDebInfo``, and ``MinSizeRel``. For more detailed
- information see :ref:`CMAKE_BUILD_TYPE <cmake_build_type>`.
+ information, see :ref:`CMAKE_BUILD_TYPE <cmake_build_type>`.
* ``-DLLVM_ENABLE_ASSERTIONS=ON`` --- Compile with assertion checks enabled
(default is ON for Debug builds, OFF for all other build types).
@@ -124,7 +124,7 @@ Getting the Source Code and Building LLVM
``ninja -C build check-llvm``
- This will setup an LLVM build with debugging info, then compile LLVM and
+ This will set up an LLVM build with debugging info, then compile LLVM and
run LLVM tests.
* For more detailed information on CMake options, see `CMake <CMake.html>`__
@@ -150,7 +150,7 @@ page.
For stand-alone builds, you must have an llvm install that is configured
properly to be consumable by stand-alone builds of the other projects.
-This could be a distro provided LLVM install, or you can build it yourself,
+This could be a distro-provided LLVM install, or you can build it yourself,
like this:
.. code-block:: console
@@ -195,7 +195,7 @@ clang clang, cmake CLANG_INCLUDE_TESTS=ON (Required for check
lld lld, cmake
============ ======================== ======================
-Example for building stand-alone `clang`:
+Example of building stand-alone `clang`:
.. code-block:: console
@@ -224,7 +224,7 @@ Example for building stand-alone `clang`:
Requirements
============
-Before you begin to use the LLVM system, review the requirements given below.
+Before you begin to use the LLVM system, review the requirements below.
This may save you some trouble by knowing ahead of time what hardware and
software you will need.
@@ -265,7 +265,7 @@ Windows on Arm ARM64 Visual Studio, Clang\ :sup:`4`
#. Code generation supported for Pentium processors and up
#. Code generation supported for 32-bit ABI only
- #. To use LLVM modules on Win32-based system, you may configure LLVM
+ #. To use LLVM modules on a Win32-based system, you may configure LLVM
with ``-DBUILD_SHARED_LIBS=On``.
#. Visual Studio alone can compile LLVM. When using Clang, you
must also have Visual Studio installed.
@@ -309,7 +309,7 @@ Package Version Notes
#. Only needed if you want to run the automated test suite in the
``llvm/test`` directory, or if you plan to utilize any Python libraries,
utilities, or bindings.
- #. Optional, adds compression / uncompression capabilities to selected LLVM
+ #. Optional, adds compression/uncompression capabilities to selected LLVM
tools.
#. Optional, you can use any other build tool supported by CMake.
#. Only needed when building libc with New Headergen. Mainly used by libc.
@@ -401,11 +401,11 @@ Studio 2019 (or later), or a recent version of mingw64. FreeBSD 10.0 and newer
have a modern Clang as the system compiler.
However, some Linux distributions and some other or older BSDs sometimes have
-extremely old versions of GCC. These steps attempt to help you upgrade you
+extremely old versions of GCC. These steps attempt to help you upgrade your
compiler even on such a system. However, if at all possible, we encourage you
to use a recent version of a distribution with a modern system compiler that
meets these requirements. Note that it is tempting to install a prior
-version of Clang and libc++ to be the host compiler, however libc++ was not
+version of Clang and libc++ to be the host compiler; however, libc++ was not
well tested or set up to build on Linux until relatively recently. As
a consequence, this guide suggests just using libstdc++ and a modern GCC as the
initial host in a bootstrap, and then using Clang (and potentially libc++).
@@ -514,11 +514,11 @@ appropriate pathname on your local system. All these paths are absolute:
``SRC_ROOT``
- This is the top level directory of the LLVM source tree.
+ This is the top-level directory of the LLVM source tree.
``OBJ_ROOT``
- This is the top level directory of the LLVM object tree (i.e. the tree where
+ This is the top-level directory of the LLVM object tree (i.e. the tree where
object files and compiled programs will be placed. It can be the same as
SRC_ROOT).
@@ -666,7 +666,7 @@ cross-compiling CMake provides a variable ``CMAKE_TOOLCHAIN_FILE`` which can
define compiler flags and variables used during the CMake test operations.
The result of such a build is executables that are not runnable on the build
-host but can be executed on the target. As an example the following CMake
+host but can be executed on the target. As an example, the following CMake
invocation can generate build files targeting iOS. This will work on macOS
with the latest Xcode:
@@ -770,7 +770,7 @@ Generates system build files.
- Some simple examples showing how to use LLVM as a compiler for a custom
language - including lowering, optimization, and code generation.
-- Kaleidoscope Tutorial: Kaleidoscope language tutorial run through the
+- Kaleidoscope Tutorial: Kaleidoscope language tutorial runs through the
implementation of a nice little compiler for a non-trivial language
including a hand-written lexer, parser, AST, as well as code generation
support using LLVM- both static (ahead of time) and various approaches to
@@ -858,7 +858,7 @@ share code among the `tools`_.
``llvm/lib/Support/``
- Source code that corresponding to the header files in ``llvm/include/ADT/``
+ Source code that corresponds to the header files in ``llvm/include/ADT/``
and ``llvm/include/Support/``.
``llvm/bindings``
@@ -1051,7 +1051,7 @@ Example with clang
% lli hello.bc
- The second examples shows how to invoke the LLVM JIT, :doc:`lli
+ The second example shows how to invoke the LLVM JIT, :doc:`lli
<CommandGuide/lli>`.
#. Use the ``llvm-dis`` utility to take a look at the LLVM assembly code:
@@ -1163,7 +1163,7 @@ following options with cmake:
Consider setting this to ``ON`` if you require a debug build, as this will ease
memory pressure on the linker. This will make linking much faster, as the
- binaries will not contain any of the debug information. Instead the debug
+ binaries will not contain any of the debug information. Instead, the debug
information is in a separate DWARF object file (with the extension ``.dwo``).
This only applies to host platforms using ELF, such as Linux.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 9a32f0c..bac13cc 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -280,9 +280,9 @@ linkage:
linkage are linked together, the two global arrays are appended
together. This is the LLVM, typesafe, equivalent of having the
system linker append together "sections" with identical names when
- .o files are linked.
+ ``.o`` files are linked.
- Unfortunately this doesn't correspond to any feature in .o files, so it
+ Unfortunately this doesn't correspond to any feature in ``.o`` files, so it
can only be used for variables like ``llvm.global_ctors`` which llvm
interprets specially.
@@ -371,7 +371,7 @@ added in the future:
This calling convention supports `tail call
optimization <CodeGenerator.html#tail-call-optimization>`_ but requires
- both the caller and callee are using it.
+ both the caller and callee to use it.
"``cc 11``" - The HiPE calling convention
This calling convention has been implemented specifically for use by
the `High-Performance Erlang
@@ -447,7 +447,7 @@ added in the future:
R11. R11 can be used as a scratch register. Furthermore it also preserves
all floating-point registers (XMMs/YMMs).
- - On AArch64 the callee preserve all general purpose registers, except
+ - On AArch64 the callee preserves all general purpose registers, except
X0-X8 and X16-X18. Furthermore it also preserves lower 128 bits of V8-V31
SIMD floating point registers. Not allowed with ``nest``.
@@ -890,7 +890,7 @@ Syntax::
[gc] [prefix Constant] [prologue Constant] [personality Constant]
(!name !N)* { ... }
-The argument list is a comma separated sequence of arguments where each
+The argument list is a comma-separated sequence of arguments where each
argument is of the following form:
Syntax::
@@ -1011,7 +1011,7 @@ some can only be checked when producing an object file:
IFuncs
-------
-IFuncs, like as aliases, don't create any new data or func. They are just a new
+IFuncs, like aliases, don't create any new data or functions. They are just a new
symbol that is resolved at runtime by calling a resolver function.
On ELF platforms, IFuncs are resolved by the dynamic linker at load time. On
@@ -1211,7 +1211,7 @@ Currently, only the following parameter attributes are defined:
the callee (for a return value).
``noext``
This indicates to the code generator that the parameter or return
- value has the high bits undefined, as for a struct in register, and
+ value has the high bits undefined, as for a struct in a register, and
therefore does not need to be sign or zero extended. This is the same
as default behavior and is only actually used (by some targets) to
validate that one of the attributes is always present.
@@ -1252,7 +1252,7 @@ Currently, only the following parameter attributes are defined:
on the stack. This implies the pointer is dereferenceable up to
the storage size of the type.
- It is not generally permissible to introduce a write to an
+ It is not generally permissible to introduce a write to a
``byref`` pointer. The pointer may have any address space and may
be read only.
@@ -1393,7 +1393,7 @@ Currently, only the following parameter attributes are defined:
storage for any other object accessible to the caller.
``captures(...)``
- This attributes restrict the ways in which the callee may capture the
+ This attribute restricts the ways in which the callee may capture the
pointer. This is not a valid attribute for return values. This attribute
applies only to the particular copy of the pointer passed in this argument.
@@ -1615,7 +1615,7 @@ Currently, only the following parameter attributes are defined:
assigning this parameter or return value to a stack slot during calling
convention lowering. The enforcement of the specified alignment is
target-dependent, as target-specific calling convention rules may override
- this value. This attribute serves the purpose of carrying language specific
+ this value. This attribute serves the purpose of carrying language-specific
alignment information that is not mapped to base types in the backend (for
example, over-alignment specification through language attributes).
@@ -1993,7 +1993,7 @@ For example:
``cold``
This attribute indicates that this function is rarely called. When
computing edge weights, basic blocks post-dominated by a cold
- function call are also considered to be cold; and, thus, given low
+ function call are also considered to be cold and, thus, given a low
weight.
.. _attr_convergent:
@@ -2892,7 +2892,7 @@ site, these bundles may contain any values that are needed by the
generated code. For more details, see :ref:`GC Transitions
<gc_transition_args>`.
-The bundle contain an arbitrary list of Values which need to be passed
+The bundle contains an arbitrary list of Values which need to be passed
to GC transition code. They will be lowered and passed as operands to
the appropriate GC_TRANSITION nodes in the selection DAG. It is assumed
that these arguments must be available before and after (but not
@@ -2903,7 +2903,7 @@ necessarily during) the execution of the callee.
Assume Operand Bundles
^^^^^^^^^^^^^^^^^^^^^^
-Operand bundles on an :ref:`llvm.assume <int_assume>` allows representing
+Operand bundles on an :ref:`llvm.assume <int_assume>` allow representing
assumptions, such as that a :ref:`parameter attribute <paramattrs>` or a
:ref:`function attribute <fnattrs>` holds for a certain value at a certain
location. Operand bundles enable assumptions that are either hard or impossible
@@ -2922,11 +2922,11 @@ restricted form:
"<tag>"([ <holds for value> [, <attribute argument>] ])
-* The tag of the operand bundle is usually the name of attribute that can be
- assumed to hold. It can also be `ignore`, this tag doesn't contain any
+* The tag of the operand bundle is usually the name of the attribute that can be
+ assumed to hold. It can also be `ignore`; this tag doesn't contain any
information and should be ignored.
-* The first argument if present is the value for which the attribute hold.
-* The second argument if present is an argument of the attribute.
+* The first argument, if present, is the value for which the attribute holds.
+* The second argument, if present, is an argument of the attribute.
  If there are no arguments, the attribute is a property of the call location.
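+
+For instance, the following (illustrative) bundle asserts that the placeholder
+value ``%ptr`` is 16-byte aligned:
+
+.. code-block:: llvm
+
+      call void @llvm.assume(i1 true) ["align"(ptr %ptr, i64 16)]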
@@ -2968,7 +2968,7 @@ the behavior is undefined, unless one of the following exceptions applies:
  dereferenceable at later points, e.g. because it could have been freed.
In addition to allowing operand bundles encoding function and parameter
-attributes, an assume operand bundle my also encode a ``separate_storage``
+attributes, an assume operand bundle may also encode a ``separate_storage``
operand bundle. This has the form:
.. code-block:: llvm
@@ -3115,7 +3115,7 @@ Note that the assembly string *must* be parseable by LLVM's integrated assembler
Data Layout
-----------
-A module may specify a target specific data layout string that specifies
+A module may specify a target-specific data layout string that specifies
how data is to be laid out in memory. The syntax for the data layout is
simply:
@@ -3356,6 +3356,19 @@ behavior is undefined:
- the size of all allocated objects must be non-negative and not exceed the
largest signed integer that fits into the index type.
+Allocated objects that are created with operations recognized by LLVM (such as
+:ref:`alloca <i_alloca>`, heap allocation functions marked as such, and global
+variables) may *not* change their size. (``realloc``-style operations do not
+change the size of an existing allocated object; instead, they create a new
+allocated object. Even if the object is at the same location as the old one, old
+pointers cannot be used to access this new object.) However, allocated objects
+can also be created by means not recognized by LLVM, e.g. by directly calling
+``mmap``. Those allocated objects are allowed to grow to the right (i.e.,
+keeping the same base address, but increasing their size) while maintaining the
+validity of existing pointers, as long as they always satisfy the properties
+described above. Currently, allocated objects are not permitted to grow to the
+left or to shrink, nor can they have holes.
+
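+For illustration (a sketch only; ``@grow_alloc`` is a placeholder for an
+allocation mechanism that LLVM does not recognize, such as a direct ``mmap``
+call):
+
+.. code-block:: llvm
+
+      %base = call ptr @grow_alloc()
+      ; the object containing %base may grow to the right here; pointers
+      ; derived from %base stay valid as long as accesses remain within
+      ; the object's (possibly grown) bounds
+      %p = getelementptr inbounds i8, ptr %base, i64 8
+      %v = load i8, ptr %p
+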
.. _objectlifetime:
Object Lifetime
@@ -3611,7 +3624,7 @@ operation may modify the memory at that address. A volatile operation
may not modify any other memory accessible by the module being compiled.
A volatile operation may not call any code in the current module.
-In general (without target specific context), the address space of a
+In general (without target-specific context), the address space of a
volatile operation may not be changed. Different address spaces may
have different trapping behavior when dereferencing an invalid
pointer.
@@ -3794,7 +3807,7 @@ If an atomic operation is marked ``syncscope("singlethread")``, it only
other operations running in the same thread (for example, in signal handlers).
If an atomic operation is marked ``syncscope("<target-scope>")``, where
-``<target-scope>`` is a target specific synchronization scope, then it is target
+``<target-scope>`` is a target-specific synchronization scope, then it is target
dependent if it *synchronizes with* and participates in the seq\_cst total
orderings of other operations.
@@ -3896,10 +3909,10 @@ Floating-Point Semantics
------------------------
This section defines the semantics for core floating-point operations on types
-that use a format specified by IEEE-745. These types are: ``half``, ``float``,
+that use a format specified by IEEE-754. These types are: ``half``, ``float``,
``double``, and ``fp128``, which correspond to the binary16, binary32, binary64,
and binary128 formats, respectively. The "core" operations are those defined in
-section 5 of IEEE-745, which all have corresponding LLVM operations.
+section 5 of IEEE-754, which all have corresponding LLVM operations.
The value returned by those operations matches that of the corresponding
IEEE-754 operation executed in the :ref:`default LLVM floating-point environment
@@ -8746,11 +8759,11 @@ framework::
The metadata encoding as lists of lists of options, as opposed to a collapsed
list of options, is chosen so that the IR encoding can use multiple option
strings to specify e.g., a single library, while still having that specifier be
-preserved as an atomic element that can be recognized by a target specific
+preserved as an atomic element that can be recognized by a target-specific
assembly writer or object file emitter.
Each individual option is required to be either a valid option for the target's
-linker, or an option that is reserved by the target specific assembly writer or
+linker, or an option that is reserved by the target-specific assembly writer or
object file emitter. No other aspect of these options is defined by the IR.
Dependent Libs Named Metadata
@@ -11928,6 +11941,9 @@ if the ``getelementptr`` has any non-zero indices, the following rules apply:
:ref:`based <pointeraliasing>` on. This means that it points into that
allocated object, or to its end. Note that the object does not have to be
live anymore; being in-bounds of a deallocated object is sufficient.
+ If the allocated object can grow, then the relevant size for being *in
+ bounds* is the maximal size the object could have while satisfying the
+ allocated object rules, not its current size.
* During the successive addition of offsets to the address, the resulting
pointer must remain *in bounds* of the allocated object at each step.
@@ -19508,7 +19524,7 @@ Semantics:
The '``llvm.set.loop.iterations.*``' intrinsics do not perform any arithmetic
on their operand. It's a hint to the backend that can use this to set up the
-hardware-loop count with a target specific instruction, usually a move of this
+hardware-loop count with a target-specific instruction, usually a move of this
value to a special register or a hardware-loop instruction.
@@ -19547,7 +19563,7 @@ Semantics:
The '``llvm.start.loop.iterations.*``' intrinsics do not perform any arithmetic
on their operand. It's a hint to the backend that can use this to set up the
-hardware-loop count with a target specific instruction, usually a move of this
+hardware-loop count with a target-specific instruction, usually a move of this
value to a special register or a hardware-loop instruction.
'``llvm.test.set.loop.iterations.*``' Intrinsic
@@ -19583,7 +19599,7 @@ Semantics:
The '``llvm.test.set.loop.iterations.*``' intrinsics do not perform any
arithmetic on their operand. It's a hint to the backend that can use this to
-set up the hardware-loop count with a target specific instruction, usually a
+set up the hardware-loop count with a target-specific instruction, usually a
move of this value to a special register or a hardware-loop instruction.
The result is the conditional value of whether the given count is not zero.
@@ -19621,7 +19637,7 @@ Semantics:
The '``llvm.test.start.loop.iterations.*``' intrinsics do not perform any
arithmetic on their operand. It's a hint to the backend that can use this to
-set up the hardware-loop count with a target specific instruction, usually a
+set up the hardware-loop count with a target-specific instruction, usually a
move of this value to a special register or a hardware-loop instruction.
The result is a pair of the input and a conditional value of whether the
given count is not zero.
@@ -26639,19 +26655,14 @@ Arguments:
The first argument is a constant integer representing the size of the
object, or -1 if it is variable sized. The second argument is a pointer
-to the object.
+to an ``alloca`` instruction.
Semantics:
""""""""""
-If ``ptr`` is a stack-allocated object and it points to the first byte of
-the object, the object is initially marked as dead.
-``ptr`` is conservatively considered as a non-stack-allocated object if
-the stack coloring algorithm that is used in the optimization pipeline cannot
-conclude that ``ptr`` is a stack-allocated object.
-
-After '``llvm.lifetime.start``', the stack object that ``ptr`` points is marked
-as alive and has an uninitialized value.
+The stack-allocated object that ``ptr`` points to is initially marked as dead.
+After '``llvm.lifetime.start``', the stack object is marked as alive and has an
+uninitialized value.
The stack object is marked as dead when either
:ref:`llvm.lifetime.end <int_lifeend>` to the alloca is executed or the
function returns.
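+
+A minimal sketch of the intended pairing (the size and names are illustrative):
+
+.. code-block:: llvm
+
+      %obj = alloca i32                                   ; initially dead
+      call void @llvm.lifetime.start.p0(i64 4, ptr %obj)  ; now alive, uninitialized
+      store i32 1, ptr %obj
+      call void @llvm.lifetime.end.p0(i64 4, ptr %obj)    ; dead again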
@@ -26661,11 +26672,6 @@ After :ref:`llvm.lifetime.end <int_lifeend>` is called,
The second '``llvm.lifetime.start``' call marks the object as alive, but it
does not change the address of the object.
-If ``ptr`` is a non-stack-allocated object, it does not point to the first
-byte of the object or it is a stack object that is already alive, it simply
-fills all bytes of the object with ``poison``.
-
-
.. _int_lifeend:
'``llvm.lifetime.end``' Intrinsic
@@ -26689,24 +26695,16 @@ Arguments:
The first argument is a constant integer representing the size of the
object, or -1 if it is variable sized. The second argument is a pointer
-to the object.
+to an ``alloca`` instruction.
Semantics:
""""""""""
-If ``ptr`` is a stack-allocated object and it points to the first byte of the
-object, the object is dead.
-``ptr`` is conservatively considered as a non-stack-allocated object if
-the stack coloring algorithm that is used in the optimization pipeline cannot
-conclude that ``ptr`` is a stack-allocated object.
+The stack-allocated object that ``ptr`` points to becomes dead after the call
+to this intrinsic.
Calling ``llvm.lifetime.end`` on an already dead alloca is a no-op.
-If ``ptr`` is a non-stack-allocated object or it does not point to the first
-byte of the object, it is equivalent to simply filling all bytes of the object
-with ``poison``.
-
-
'``llvm.invariant.start``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index d417de7..68490c8 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -135,7 +135,7 @@ rarely have to include this file directly).
return !L->contains(cast<Instruction>(V)->getParent());
}
- Note that you should **not** use an ``isa<>`` test followed by a ``cast<>``,
+ Note that you should **not** use an ``isa<>`` test followed by a ``cast<>``;
for that use the ``dyn_cast<>`` operator.
``dyn_cast<>``:
@@ -234,8 +234,8 @@ the ``str`` member function. See ``llvm/ADT/StringRef.h`` (`doxygen
<https://llvm.org/doxygen/StringRef_8h_source.html>`__) for more
information.
-You should rarely use the ``StringRef`` class directly, because it contains
-pointers to external memory it is not generally safe to store an instance of the
+You should rarely use the ``StringRef`` class directly. Because it contains
+pointers to external memory, it is not generally safe to store an instance of the
class (unless you know that the external storage will not be freed).
``StringRef`` is small and pervasive enough in LLVM that it should always be
passed by value.
@@ -416,14 +416,14 @@ to abort quickly at the point of failure (providing some basic diagnostic) when
invariants are broken at runtime.
The fundamental tools for handling programmatic errors are assertions and the
-llvm_unreachable function. Assertions are used to express invariant conditions,
+``llvm_unreachable`` function. Assertions are used to express invariant conditions,
and should include a message describing the invariant:
.. code-block:: c++
assert(isPhysReg(R) && "All virt regs should have been allocated already.");
-The llvm_unreachable function can be used to document areas of control flow
+The ``llvm_unreachable`` function can be used to document areas of control flow
that should never be entered if the program invariants hold:
.. code-block:: c++
@@ -598,7 +598,7 @@ semantics. For example:
}
This third form works with any type that can be assigned to from ``T&&``. This
-can be useful if the ``Expected<T>`` value needs to be stored an already-declared
+can be useful if the ``Expected<T>`` value needs to be stored in an already-declared
``std::optional<T>``. For example:
.. code-block:: c++
@@ -619,7 +619,7 @@ can be useful if the ``Expected<T>`` value needs to be stored an already-declare
All ``Error`` instances, whether success or failure, must be either checked or
moved from (via ``std::move`` or a return) before they are destructed.
-Accidentally discarding an unchecked error will cause a program abort at the
+Accidentally discarding an unchecked error will cause a program to abort at the
point where the unchecked value's destructor is run, making it easy to identify
and fix violations of this rule.
@@ -661,7 +661,7 @@ a variadic list of "handlers", each of which must be a callable type (a
function, lambda, or class with a call operator) with one argument. The
``handleErrors`` function will visit each handler in the sequence and check its
argument type against the dynamic type of the error, running the first handler
-that matches. This is the same decision process that is used decide which catch
+that matches. This is the same decision process that is used to decide which catch
clause to run for a C++ exception.
Since the list of handlers passed to ``handleErrors`` may not cover every error
@@ -869,10 +869,10 @@ T value:
}
Like the ExitOnError utility, cantFail simplifies control flow. Their treatment
-of error cases is very different however: Where ExitOnError is guaranteed to
+of error cases is very different, however: Where ExitOnError is guaranteed to
terminate the program on an error input, cantFail simply asserts that the result
is success. In debug builds this will result in an assertion failure if an error
-is encountered. In release builds the behavior of cantFail for failure values is
+is encountered. In release builds, the behavior of cantFail for failure values is
undefined. As such, care must be taken in the use of cantFail: clients must be
certain that a cantFail wrapped call really can not fail with the given
arguments.
@@ -928,7 +928,7 @@ well-formed Foo or an Error, never an object in an invalid state.
Propagating and consuming errors based on types
"""""""""""""""""""""""""""""""""""""""""""""""
-In some contexts, certain types of error are known to be benign. For example,
+In some contexts, certain types of errors are known to be benign. For example,
when walking an archive, some clients may be happy to skip over badly formatted
object files rather than terminating the walk immediately. Skipping badly
formatted objects could be achieved using an elaborate handler method, but the
@@ -956,7 +956,7 @@ type inspection method, ``isA``, and the ``consumeError`` function:
Concatenating Errors with joinErrors
""""""""""""""""""""""""""""""""""""
-In the archive walking example above ``BadFileFormat`` errors are simply
+In the archive walking example above, ``BadFileFormat`` errors are simply
consumed and ignored. If the client had wanted to report these errors after
completing the walk over the archive, they could use the ``joinErrors`` utility:
@@ -982,13 +982,13 @@ The ``joinErrors`` routine builds a special error type called ``ErrorList``,
which holds a list of user defined errors. The ``handleErrors`` routine
recognizes this type and will attempt to handle each of the contained errors in
order. If all contained errors can be handled, ``handleErrors`` will return
-``Error::success()``, otherwise ``handleErrors`` will concatenate the remaining
+``Error::success()``; otherwise, ``handleErrors`` will concatenate the remaining
errors and return the resulting ``ErrorList``.
Building fallible iterators and iterator ranges
"""""""""""""""""""""""""""""""""""""""""""""""
-The archive walking examples above retrieve archive members by index, however
+The archive walking examples above retrieve archive members by index; however,
this requires considerable boiler-plate for iteration and error checking. We can
clean this up by using the "fallible iterator" pattern, which supports the
following natural iteration idiom for fallible containers like Archive:
@@ -1039,7 +1039,7 @@ fallible_iterator utility which provides ``operator++`` and ``operator--``,
returning any errors via a reference passed in to the wrapper at construction
time. The fallible_iterator wrapper takes care of (a) jumping to the end of the
range on error, and (b) marking the error as checked whenever an iterator is
-compared to ``end`` and found to be inequal (in particular: this marks the
+compared to ``end`` and found to be unequal (in particular, this marks the
error as checked throughout the body of a range-based for loop), enabling early
exit from the loop without redundant error checking.
@@ -1068,7 +1068,7 @@ functions. E.g.:
Using the fallible_iterator utility allows for both natural construction of
fallible iterators (using failing ``inc`` and ``dec`` operations) and
-relatively natural use of c++ iterator/loop idioms.
+relatively natural use of C++ iterator/loop idioms.
.. _function_apis:
@@ -1175,7 +1175,7 @@ Then you can run your pass like this:
I am here!
Using the ``LLVM_DEBUG()`` macro instead of a home-brewed solution allows you to not
-have to create "yet another" command line option for the debug output for your
+have to create "yet another" command-line option for the debug output for your
pass. Note that ``LLVM_DEBUG()`` macros are disabled for non-asserts builds, so they
do not cause a performance impact at all (for the same reason, they should also
not contain side-effects!).
@@ -1349,7 +1349,7 @@ certain number of times.
The ``llvm/Support/DebugCounter.h`` (`doxygen
<https://llvm.org/doxygen/DebugCounter_8h_source.html>`__) file
provides a class named ``DebugCounter`` that can be used to create
-command line counter options that control execution of parts of your code.
+command-line counter options that control execution of parts of your code.
Define your DebugCounter like this:
@@ -1364,7 +1364,7 @@ is specified by the first argument. The name of the counter
argument, and the description used in the help is specified by the
third argument.
-Whatever code you want that control, use ``DebugCounter::shouldExecute`` to control it.
+Whatever code you want to control, use ``DebugCounter::shouldExecute`` to control it.
.. code-block:: c++
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 5591ac6..48d2ef1 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -56,37 +56,9 @@ Makes programs 10x faster by doing Special New Thing.
Changes to the LLVM IR
----------------------
-* It is no longer permitted to inspect the uses of ConstantData. Use
- count APIs will behave as if they have no uses (i.e. use_empty() is
- always true).
-
-* The `nocapture` attribute has been replaced by `captures(none)`.
-* The constant expression variants of the following instructions have been
- removed:
-
- * `mul`
-
-* Updated semantics of `llvm.type.checked.load.relative` to match that of
- `llvm.load.relative`.
-* Inline asm calls no longer accept ``label`` arguments. Use ``callbr`` instead.
-
-* Updated semantics of the `callbr` instruction to clarify that its
- 'indirect labels' are not expected to be reached by indirect (as in
- register-controlled) branch instructions, and therefore are not
- guaranteed to start with a `bti` or `endbr64` instruction, where
- those exist.
-
Changes to LLVM infrastructure
------------------------------
-* Removed support for target intrinsics being defined in the target directories
- themselves (i.e., the `TargetIntrinsicInfo` class).
-* Fix Microsoft demangling of string literals to be stricter
- (#GH129970))
-* Added the support for ``fmaximum`` and ``fminimum`` in ``atomicrmw`` instruction. The
- comparison is expected to match the behavior of ``llvm.maximum.*`` and
- ``llvm.minimum.*`` respectively.
-
Changes to building LLVM
------------------------
@@ -99,31 +71,9 @@ Changes to Interprocedural Optimizations
Changes to the AArch64 Backend
------------------------------
-* Added the `execute-only` target feature, which indicates that the generated
- program code doesn't contain any inline data, and there are no data accesses
- to code sections. On ELF targets this property is indicated by the
- `SHF_AARCH64_PURECODE` section flag.
- ([#125687](https://github.com/llvm/llvm-project/pull/125687),
- [#132196](https://github.com/llvm/llvm-project/pull/132196),
- [#133084](https://github.com/llvm/llvm-project/pull/133084))
-
Changes to the AMDGPU Backend
-----------------------------
-* Enabled the
- [FWD_PROGRESS bit](https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor)
- for all GFX ISAs greater or equal to 10, for the AMDHSA OS.
-
-* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
-
-* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
-intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
-(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
-represented in the IR without needing to use buffer resource intrinsics directly.
-This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
-buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
-optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
-
Changes to the ARM Backend
--------------------------
@@ -136,106 +86,27 @@ Changes to the DirectX Backend
Changes to the Hexagon Backend
------------------------------
-* The default Hexagon architecture version in ELF object files produced by
- the tools such as llvm-mc is changed to v68. This version will be set if
- the user does not provide the CPU version in the command line.
-
Changes to the LoongArch Backend
--------------------------------
-* Changing the default code model from `small` to `medium` for 64-bit.
-* Added inline asm support for the `q` constraint.
-* Added the `32s` target feature for LA32S ISA extensions.
-* Added codegen support for atomic-ops (`cmpxchg`, `max`, `min`, `umax`, `umin`) on LA32.
-* Added codegen support for the ILP32D calling convention.
-* Added several codegen and vectorization optimizations.
-
Changes to the MIPS Backend
---------------------------
-* `-mcpu=i6400` and `-mcpu=i6500` were added.
-
Changes to the PowerPC Backend
------------------------------
Changes to the RISC-V Backend
-----------------------------
-* Adds experimental assembler support for the Qualcomm uC 'Xqcilb` (Long Branch)
- extension.
-* Adds experimental assembler support for the Qualcomm uC 'Xqcili` (Load Large Immediate)
- extension.
-* Adds experimental assembler support for the Qualcomm uC 'Xqcilia` (Large Immediate Arithmetic)
- extension.
-* Adds experimental assembler support for the Qualcomm uC 'Xqcibm` (Bit Manipulation)
- extension.
-* Adds experimental assembler support for the Qualcomm uC 'Xqcibi` (Branch Immediate)
- extension.
-* Adds experimental assembler and code generation support for the Qualcomm
- 'Xqccmp' extension, which is a frame-pointer convention compatible version of
- Zcmp.
-* Added non-quadratic ``log-vrgather`` cost model for ``vrgather.vv`` instruction
-* Adds experimental assembler support for the Qualcomm uC 'Xqcisim` (Simulation Hint)
- extension.
-* Adds experimental assembler support for the Qualcomm uC 'Xqcisync` (Sync Delay)
- extension.
-* Adds experimental assembler support for the Qualcomm uC 'Xqciio` (External Input Output)
- extension.
-* Adds assembler support for the 'Zilsd` (Load/Store Pair Instructions)
- extension.
-* Adds assembler support for the 'Zclsd` (Compressed Load/Store Pair Instructions)
- extension.
-* Adds experimental assembler support for Zvqdotq.
-* Adds Support for Qualcomm's `qci-nest` and `qci-nonest` interrupt types, which
- use instructions from `Xqciint` to save and restore some GPRs during interrupt
- handlers.
-* When the experimental extension `Xqcili` is enabled, `qc.e.li` and `qc.li` may
- now be used to materialize immediates.
-* Adds assembler support for ``.option exact``, which disables automatic compression,
- and branch and linker relaxation. This can be disabled with ``.option noexact``,
- which is also the default.
-* `-mcpu=xiangshan-kunminghu` was added.
-* `-mcpu=andes-n45` and `-mcpu=andes-nx45` were added.
-* `-mcpu=andes-a45` and `-mcpu=andes-ax45` were added.
-* Adds support for the 'Ziccamoc` (Main Memory Supports Atomics in Zacas) extension, which was introduced as an optional extension of the RISC-V Profiles specification.
-* Adds experimental assembler support for SiFive CLIC CSRs, under the names
- `Zsfmclic` for the M-mode registers and `Zsfsclic` for the S-mode registers.
-* Adds Support for SiFive CLIC interrupt attributes, which automate writing CLIC
- interrupt handlers without using inline assembly.
-* Adds assembler support for the Andes `XAndesperf` (Andes Performance extension).
-* `-mcpu=sifive-p870` was added.
-* Adds assembler support for the Andes `XAndesvpackfph` (Andes Vector Packed FP16 extension).
-* Adds assembler support for the Andes `XAndesvdot` (Andes Vector Dot Product extension).
-* Adds assembler support for the standard `Q` (Quad-Precision Floating Point)
- extension.
-* Adds experimental assembler support for the SiFive Xsfmm* Attached Matrix
- Extensions.
-* `-mcpu=andes-a25` and `-mcpu=andes-ax25` were added.
-* The `Shlcofideleg` extension was added.
-* `-mcpu=sifive-x390` was added.
-* `-mtune=andes-45-series` was added.
-* Adds assembler support for the Andes `XAndesvbfhcvt` (Andes Vector BFLOAT16 Conversion extension).
-* `-mcpu=andes-ax45mpv` was added.
-* Removed -mattr=+no-rvc-hints that could be used to disable parsing and generation of RVC hints.
-* Adds assembler support for the Andes `XAndesvsintload` (Andes Vector INT4 Load extension).
-* Adds assembler support for the Andes `XAndesbfhcvt` (Andes Scalar BFLOAT16 Conversion extension).
-
Changes to the WebAssembly Backend
----------------------------------
Changes to the Windows Target
-----------------------------
-* `fp128` is now passed indirectly, meaning it uses the same calling convention
- as `i128`.
-
Changes to the X86 Backend
--------------------------
-* `fp128` will now use `*f128` libcalls on 32-bit GNU targets as well.
-* On x86-32, `fp128` and `i128` are now passed with the expected 16-byte stack
- alignment.
-
Changes to the OCaml bindings
-----------------------------
@@ -245,25 +116,6 @@ Changes to the Python bindings
Changes to the C API
--------------------
-* The following functions for creating constant expressions have been removed,
- because the underlying constant expressions are no longer supported. Instead,
- an instruction should be created using the `LLVMBuildXYZ` APIs, which will
- constant fold the operands if possible and create an instruction otherwise:
-
- * `LLVMConstMul`
- * `LLVMConstNUWMul`
- * `LLVMConstNSWMul`
-
-* Added `LLVMConstDataArray` and `LLVMGetRawDataValues` to allow creating and
- reading `ConstantDataArray` values without needing extra `LLVMValueRef`s for
- individual elements.
-
-* Added ``LLVMDIBuilderCreateEnumeratorOfArbitraryPrecision`` for creating
- debugging metadata of enumerators larger than 64 bits.
-
-* Added ``LLVMGetICmpSameSign`` and ``LLVMSetICmpSameSign`` for the `samesign`
- flag on `icmp` instructions.
-
Changes to the CodeGen infrastructure
-------------------------------------
@@ -276,59 +128,9 @@ Changes to the Debug Info
Changes to the LLVM tools
---------------------------------
-* llvm-objcopy now supports the `--update-section` flag for intermediate Mach-O object files.
-* llvm-strip now supports continuing to process files on encountering an error.
-* In llvm-objcopy/llvm-strip's ELF port, `--discard-locals` and `--discard-all` now allow and preserve symbols referenced by relocations.
- ([#47468](https://github.com/llvm/llvm-project/issues/47468))
-* llvm-addr2line now supports a `+` prefix when specifying an address.
-* Support for `SHT_LLVM_BB_ADDR_MAP` versions 0 and 1 has been dropped.
-
Changes to LLDB
---------------------------------
-* When building LLDB with Python support, the minimum version of Python is now
- 3.8.
-* LLDB now supports hardware watchpoints for AArch64 Windows targets. Windows
- does not provide API to query the number of supported hardware watchpoints.
- Therefore current implementation allows only 1 watchpoint, as tested with
- Windows 11 on the Microsoft SQ2 and Snapdragon Elite X platforms.
-* LLDB now steps through C++ thunks. This fixes an issue where previously, it
- wouldn't step into multiple inheritance virtual functions.
-* A statusline was added to command-line LLDB to show progress events and
- information about the current state of the debugger at the bottom of the
- terminal. This is on by default and can be configured using the
- `show-statusline` and `statusline-format` settings. It is not currently
- supported on Windows.
-* The `min-gdbserver-port` and `max-gdbserver-port` options have been removed
- from `lldb-server`'s platform mode. Since the changes to `lldb-server`'s port
- handling in LLDB 20, these options have had no effect.
-* LLDB now supports `process continue --reverse` when used with debug servers
- supporting reverse execution, such as [rr](https://rr-project.org).
- When using reverse execution, `process continue --forward` returns to the
- forward execution.
-* LLDB now supports RISC-V 32-bit ELF core files.
-* LLDB now supports siginfo descriptions for Linux user-space signals. User space
- signals will now have descriptions describing the method and sender.
- ```
- stop reason = SIGSEGV: sent by tkill system call (sender pid=649752, uid=2667987)
- ```
-* ELF Cores can now have their siginfo structures inspected using `thread siginfo`.
-* LLDB now uses
- [DIL](https://discourse.llvm.org/t/rfc-data-inspection-language/69893) as the
- default implementation for 'frame variable'. This should not change the
- behavior of 'frame variable' at all, at this time. To revert to using the
- old implementation use: `settings set target.experimental.use-DIL false`.
-* Disassembly of unknown instructions now produces `<unknown>` instead of
- nothing at all
-* Changed the format of opcode bytes to match llvm-objdump when disassembling
- RISC-V code with `disassemble`'s `--byte` option.
-
-
-### Changes to lldb-dap
-
-* Breakpoints can now be set for specific columns within a line.
-* Function return value is now displayed on step-out.
-
Changes to BOLT
---------------------------------
diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst
index b6dda6a..76b6b4e 100644
--- a/llvm/docs/TestingGuide.rst
+++ b/llvm/docs/TestingGuide.rst
@@ -152,12 +152,12 @@ can run the LLVM and Clang tests simultaneously using:
% make check-all
-To run the tests with Valgrind (Memcheck by default), use the ``LIT_ARGS`` make
+To run the tests with Valgrind (Memcheck by default), use the ``LIT_OPTS`` make
variable to pass the required options to lit. For example, you can use:
.. code-block:: bash
- % make check LIT_ARGS="-v --vg --vg-leak"
+ % make check LIT_OPTS="-v --vg --vg-leak"
to enable testing with valgrind and with leak checking enabled.
diff --git a/llvm/docs/YamlIO.rst b/llvm/docs/YamlIO.rst
index 7137c56..420adb8 100644
--- a/llvm/docs/YamlIO.rst
+++ b/llvm/docs/YamlIO.rst
@@ -92,7 +92,7 @@ corresponding denormalization step.
YAML I/O uses a non-invasive, traits based design. YAML I/O defines some
abstract base templates. You specialize those templates on your data types.
For instance, if you have an enumerated type FooBar you could specialize
-ScalarEnumerationTraits on that type and define the enumeration() method:
+ScalarEnumerationTraits on that type and define the ``enumeration()`` method:
.. code-block:: c++
@@ -113,7 +113,7 @@ values and the YAML string representation is only in one place.
This assures that the code for writing and parsing of YAML stays in sync.
To specify a YAML mappings, you define a specialization on
-llvm::yaml::MappingTraits.
+``llvm::yaml::MappingTraits``.
If your native data structure happens to be a struct that is already normalized,
then the specialization is simple. For example:
@@ -131,9 +131,9 @@ then the specialization is simple. For example:
};
-A YAML sequence is automatically inferred if you data type has begin()/end()
-iterators and a push_back() method. Therefore any of the STL containers
-(such as std::vector<>) will automatically translate to YAML sequences.
+A YAML sequence is automatically inferred if your data type has ``begin()``/``end()``
+iterators and a ``push_back()`` method. Therefore, any of the STL containers
+(such as ``std::vector<>``) will automatically translate to YAML sequences.
Once you have defined specializations for your data types, you can
programmatically use YAML I/O to write a YAML document:
@@ -195,8 +195,8 @@ Error Handling
==============
When parsing a YAML document, if the input does not match your schema (as
-expressed in your XxxTraits<> specializations). YAML I/O
-will print out an error message and your Input object's error() method will
+expressed in your ``XxxTraits<>`` specializations), YAML I/O
+will print out an error message and your Input object's ``error()`` method will
return true. For instance, the following document:
.. code-block:: yaml
@@ -265,8 +265,8 @@ operators to and from the base type. For example:
LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyBarFlags)
This generates two classes MyFooFlags and MyBarFlags which you can use in your
-native data structures instead of uint32_t. They are implicitly
-converted to and from uint32_t. The point of creating these unique types
+native data structures instead of ``uint32_t``. They are implicitly
+converted to and from ``uint32_t``. The point of creating these unique types
is that you can now specify traits on them to get different YAML conversions.
Hex types
@@ -280,15 +280,15 @@ format used by the built-in integer types:
* Hex16
* Hex8
-You can use llvm::yaml::Hex32 instead of uint32_t and the only different will
+You can use ``llvm::yaml::Hex32`` instead of ``uint32_t`` and the only difference will
be that when YAML I/O writes out that type it will be formatted in hexadecimal.
ScalarEnumerationTraits
-----------------------
YAML I/O supports translating between in-memory enumerations and a set of string
-values in YAML documents. This is done by specializing ScalarEnumerationTraits<>
-on your enumeration type and define an enumeration() method.
+values in YAML documents. This is done by specializing ``ScalarEnumerationTraits<>``
+on your enumeration type and defining an ``enumeration()`` method.
For instance, suppose you had an enumeration of CPUs and a struct with it as
a field:
@@ -333,9 +333,9 @@ as a field type:
};
When reading YAML, if the string found does not match any of the strings
-specified by enumCase() methods, an error is automatically generated.
+specified by ``enumCase()`` methods, an error is automatically generated.
When writing YAML, if the value being written does not match any of the values
-specified by the enumCase() methods, a runtime assertion is triggered.
+specified by the ``enumCase()`` methods, a runtime assertion is triggered.
BitValue
@@ -442,10 +442,10 @@ Sometimes for readability a scalar needs to be formatted in a custom way. For
instance your internal data structure may use an integer for time (seconds since
some epoch), but in YAML it would be much nicer to express that integer in
some time format (e.g. 4-May-2012 10:30pm). YAML I/O has a way to support
-custom formatting and parsing of scalar types by specializing ScalarTraits<> on
+custom formatting and parsing of scalar types by specializing ``ScalarTraits<>`` on
your data type. When writing, YAML I/O will provide the native type and
-your specialization must create a temporary llvm::StringRef. When reading,
-YAML I/O will provide an llvm::StringRef of scalar and your specialization
+your specialization must create a temporary ``llvm::StringRef``. When reading,
+YAML I/O will provide an ``llvm::StringRef`` of the scalar and your specialization
must convert that to your native data type. An outline of a custom scalar type
looks like:
@@ -482,15 +482,15 @@ literal block notation, just like the example shown below:
Second line
The YAML I/O library provides support for translating between YAML block scalars
-and specific C++ types by allowing you to specialize BlockScalarTraits<> on
+and specific C++ types by allowing you to specialize ``BlockScalarTraits<>`` on
your data type. The library doesn't provide any built-in support for block
-scalar I/O for types like std::string and llvm::StringRef as they are already
+scalar I/O for types like ``std::string`` and ``llvm::StringRef`` as they are already
supported by YAML I/O and use the ordinary scalar notation by default.
BlockScalarTraits specializations are very similar to the
ScalarTraits specialization - YAML I/O will provide the native type and your
-specialization must create a temporary llvm::StringRef when writing, and
-it will also provide an llvm::StringRef that has the value of that block scalar
+specialization must create a temporary ``llvm::StringRef`` when writing, and
+it will also provide an ``llvm::StringRef`` that has the value of that block scalar
and your specialization must convert that to your native data type when reading.
An example of a custom type with an appropriate specialization of
BlockScalarTraits is shown below:
@@ -524,7 +524,7 @@ Mappings
========
To be translated to or from a YAML mapping for your type T you must specialize
-llvm::yaml::MappingTraits on T and implement the "void mapping(IO &io, T&)"
+``llvm::yaml::MappingTraits`` on T and implement the "void mapping(IO &io, T&)"
method. If your native data structures use pointers to a class everywhere,
you can specialize on the class pointer. Examples:
@@ -585,7 +585,7 @@ No Normalization
The ``mapping()`` method is responsible, if needed, for normalizing and
denormalizing. In a simple case where the native data structure requires no
-normalization, the mapping method just uses mapOptional() or mapRequired() to
+normalization, the mapping method just uses ``mapOptional()`` or ``mapRequired()`` to
bind the struct's fields to YAML key names. For example:
.. code-block:: c++
@@ -605,11 +605,11 @@ bind the struct's fields to YAML key names. For example:
Normalization
----------------
-When [de]normalization is required, the mapping() method needs a way to access
+When [de]normalization is required, the ``mapping()`` method needs a way to access
normalized values as fields. To help with this, there is
-a template MappingNormalization<> which you can then use to automatically
+a template ``MappingNormalization<>`` which you can then use to automatically
do the normalization and denormalization. The template is used to create
-a local variable in your mapping() method which contains the normalized keys.
+a local variable in your ``mapping()`` method which contains the normalized keys.
Suppose you have native data type
Polar which specifies a position in polar coordinates (distance, angle):
@@ -629,7 +629,7 @@ is, you want the yaml to look like:
x: 10.3
y: -4.7
-You can support this by defining a MappingTraits that normalizes the polar
+You can support this by defining a ``MappingTraits`` that normalizes the polar
coordinates to x,y coordinates when writing YAML and denormalizes x,y
coordinates into polar when reading YAML.
@@ -667,47 +667,47 @@ coordinates into polar when reading YAML.
};
When writing YAML, the local variable "keys" will be a stack allocated
-instance of NormalizedPolar, constructed from the supplied polar object which
-initializes it x and y fields. The mapRequired() methods then write out the x
+instance of ``NormalizedPolar``, constructed from the supplied polar object which
+initializes its x and y fields. The ``mapRequired()`` methods then write out the x
and y values as key/value pairs.
When reading YAML, the local variable "keys" will be a stack allocated instance
-of NormalizedPolar, constructed by the empty constructor. The mapRequired
+of ``NormalizedPolar``, constructed by the default constructor. The ``mapRequired()``
methods will find the matching key in the YAML document and fill in the x and y
-fields of the NormalizedPolar object keys. At the end of the mapping() method
-when the local keys variable goes out of scope, the denormalize() method will
+fields of the ``NormalizedPolar`` object keys. At the end of the ``mapping()`` method
+when the local keys variable goes out of scope, the ``denormalize()`` method will
automatically be called to convert the read values back to polar coordinates,
-and then assigned back to the second parameter to mapping().
+and the result is then assigned back to the second parameter to ``mapping()``.
In some cases, the normalized class may be a subclass of the native type and
-could be returned by the denormalize() method, except that the temporary
+could be returned by the ``denormalize()`` method, except that the temporary
normalized instance is stack allocated. In these cases, the utility template
-MappingNormalizationHeap<> can be used instead. It just like
-MappingNormalization<> except that it heap allocates the normalized object
-when reading YAML. It never destroys the normalized object. The denormalize()
+``MappingNormalizationHeap<>`` can be used instead. It is just like
+``MappingNormalization<>`` except that it heap allocates the normalized object
+when reading YAML. It never destroys the normalized object. The ``denormalize()``
method can then return "this".
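+
+A minimal sketch of the heap variant (the types are hypothetical; the shape
+of the traits mirrors the ``MappingNormalization<>`` example above):
+
+.. code-block:: c++
+
+    struct NormalizedShape : public Shape {
+      NormalizedShape(IO &io) {}                  // used when reading
+      NormalizedShape(IO &io, Shape *&s) { /* copy fields from *s */ }
+      Shape *denormalize(IO &io) { return this; } // heap allocated, so safe
+    };
+
+    template <>
+    struct MappingTraits<Shape *> {
+      static void mapping(IO &io, Shape *&shape) {
+        MappingNormalizationHeap<NormalizedShape, Shape *> keys(io, shape);
+        // Bind keys-> fields with io.mapRequired()/io.mapOptional() as usual.
+      }
+    };
+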
Default values
--------------
-Within a mapping() method, calls to io.mapRequired() mean that that key is
+Within a ``mapping()`` method, calls to ``io.mapRequired()`` mean that that key is
required to exist when parsing YAML documents, otherwise YAML I/O will issue an
error.
-On the other hand, keys registered with io.mapOptional() are allowed to not
+On the other hand, keys registered with ``io.mapOptional()`` are allowed to not
exist in the YAML document being read. So what value is put in the field
for those optional keys?
There are two steps to how those optional fields are filled in. First, the
-second parameter to the mapping() method is a reference to a native class. That
+second parameter to the ``mapping()`` method is a reference to a native class. That
native class must have a default constructor. Whatever value the default
constructor initially sets for an optional field will be that field's value.
-Second, the mapOptional() method has an optional third parameter. If provided
-it is the value that mapOptional() should set that field to if the YAML document
+Second, the ``mapOptional()`` method has an optional third parameter. If provided,
+it is the value that ``mapOptional()`` should set that field to if the YAML document
does not have that key.
There is one important difference between those two ways (default constructor
-and third parameter to mapOptional). When YAML I/O generates a YAML document,
-if the mapOptional() third parameter is used, if the actual value being written
+and third parameter to ``mapOptional()``). When YAML I/O generates a YAML document,
+if the ``mapOptional()`` third parameter is used and the actual value being written
is the same as (using ==) the default value, then that key/value is not written.
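+
+For example, a minimal sketch (the struct and field names are hypothetical):
+
+.. code-block:: c++
+
+    template <>
+    struct MappingTraits<Info> {
+      static void mapping(IO &io, Info &info) {
+        io.mapRequired("name",  info.name);
+        // Defaults to 0 when the key is absent; the key is also omitted
+        // on output when info.flags == 0.
+        io.mapOptional("flags", info.flags, 0u);
+      }
+    };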
@@ -715,14 +715,14 @@ Order of Keys
--------------
When writing out a YAML document, the keys are written in the order that the
-calls to mapRequired()/mapOptional() are made in the mapping() method. This
+calls to ``mapRequired()``/``mapOptional()`` are made in the ``mapping()`` method. This
gives you a chance to write the fields in an order that a human reader of
the YAML document would find natural. This may be different from the order
of the fields in the native class.
When reading in a YAML document, the keys in the document can be in any order,
-but they are processed in the order that the calls to mapRequired()/mapOptional()
-are made in the mapping() method. That enables some interesting
+but they are processed in the order that the calls to ``mapRequired()``/``mapOptional()``
+are made in the ``mapping()`` method. That enables some interesting
functionality. For instance, if the first field bound is the cpu and the second
field bound is flags, and the flags are cpu specific, you can programmatically
switch how the flags are converted to and from YAML based on the cpu.
@@ -761,7 +761,7 @@ model. Recently, we added support to YAML I/O for checking/setting the optional
tag on a map. Using this functionality it is even possible to support different
mappings, as long as they are convertible.
-To check a tag, inside your mapping() method you can use io.mapTag() to specify
+To check a tag, inside your ``mapping()`` method you can use ``io.mapTag()`` to specify
what the tag should be. This will also add that tag when writing yaml.
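+
+A minimal sketch, assuming a hypothetical ``!polar`` tag on the ``Polar``
+mapping from the earlier example:
+
+.. code-block:: c++
+
+    template <>
+    struct MappingTraits<Polar> {
+      static void mapping(IO &io, Polar &polar) {
+        // Checks for the tag when reading and emits it when writing.
+        io.mapTag("!polar", /*Default=*/true);
+        io.mapRequired("distance", polar.distance);
+        io.mapRequired("angle",    polar.angle);
+      }
+    };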
Validation
@@ -834,7 +834,7 @@ Sequence
========
To be translated to or from a YAML sequence for your type T you must specialize
-llvm::yaml::SequenceTraits on T and implement two methods:
+``llvm::yaml::SequenceTraits`` on T and implement two methods:
``size_t size(IO &io, T&)`` and
``T::value_type& element(IO &io, T&, size_t indx)``. For example:
@@ -846,10 +846,10 @@ llvm::yaml::SequenceTraits on T and implement two methods:
static MySeqEl &element(IO &io, MySeq &list, size_t index) { ... }
};
-The size() method returns how many elements are currently in your sequence.
-The element() method returns a reference to the i'th element in the sequence.
-When parsing YAML, the element() method may be called with an index one bigger
-than the current size. Your element() method should allocate space for one
+The ``size()`` method returns how many elements are currently in your sequence.
+The ``element()`` method returns a reference to the i'th element in the sequence.
+When parsing YAML, the ``element()`` method may be called with an index one bigger
+than the current size. Your ``element()`` method should allocate space for one
more element (using the default constructor if the element is a C++ object) and
return a reference to that newly allocated space.
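+
+For example, a sketch of an ``element()`` method for a
+``std::vector``-backed sequence that grows on demand:
+
+.. code-block:: c++
+
+    static MySeqEl &element(IO &io, MySeq &list, size_t index) {
+      // When parsing, index may be one past the current end; default
+      // construct the new element so a reference can be returned.
+      if (index >= list.size())
+        list.resize(index + 1);
+      return list[index];
+    }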
@@ -881,10 +881,10 @@ configuration.
Utility Macros
--------------
-Since a common source of sequences is std::vector<>, YAML I/O provides macros:
-LLVM_YAML_IS_SEQUENCE_VECTOR() and LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR() which
-can be used to easily specify SequenceTraits<> on a std::vector type. YAML
-I/O does not partial specialize SequenceTraits on std::vector<> because that
+Since a common source of sequences is ``std::vector<>``, YAML I/O provides macros:
+``LLVM_YAML_IS_SEQUENCE_VECTOR()`` and ``LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR()`` which
+can be used to easily specify ``SequenceTraits<>`` on a ``std::vector`` type. YAML
+I/O does not partially specialize ``SequenceTraits`` on ``std::vector<>`` because that
would force all vectors to be sequences. An example use of the macros:
.. code-block:: c++
@@ -906,7 +906,7 @@ have need for multiple documents. The top level node in their YAML schema
will be a mapping or sequence. For those cases, the following is not needed.
But for cases where you do want multiple documents, you can specify a
trait for your document list type. The trait has the same methods as
-SequenceTraits but is named DocumentListTraits. For example:
+``SequenceTraits`` but is named ``DocumentListTraits``. For example:
.. code-block:: c++
@@ -919,7 +919,7 @@ SequenceTraits but is named DocumentListTraits. For example:
User Context Data
=================
-When an llvm::yaml::Input or llvm::yaml::Output object is created their
+When an ``llvm::yaml::Input`` or ``llvm::yaml::Output`` object is created, its
constructor takes an optional "context" parameter. This is a pointer to
whatever state information you might need.
@@ -927,8 +927,8 @@ For instance, in a previous example we showed how the conversion type for a
flags field could be determined at runtime based on the value of another field
in the mapping. But what if an inner mapping needs to know some field value
of an outer mapping? That is where the "context" parameter comes in. You
-can set values in the context in the outer map's mapping() method and
-retrieve those values in the inner map's mapping() method.
+can set values in the context in the outer map's ``mapping()`` method and
+retrieve those values in the inner map's ``mapping()`` method.
The context value is just a ``void*``. All your traits which use the context
and operate on your native data types need to agree on what the context value
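+
+A minimal sketch (the context type is hypothetical):
+
+.. code-block:: c++
+
+    struct MyContext { unsigned cpu; };
+
+    // Inside an inner map's mapping() method:
+    MyContext *ctxt = static_cast<MyContext *>(io.getContext());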
@@ -939,9 +939,9 @@ traits use to shared context sensitive information.
Output
======
-The llvm::yaml::Output class is used to generate a YAML document from your
+The ``llvm::yaml::Output`` class is used to generate a YAML document from your
in-memory data structures, using traits defined on your data types.
-To instantiate an Output object you need an llvm::raw_ostream, an optional
+To instantiate an ``Output`` object, you need an ``llvm::raw_ostream``, an optional
context pointer and an optional wrapping column:
.. code-block:: c++
@@ -957,7 +957,7 @@ streaming as YAML is a mapping, scalar, or sequence, then Output assumes you
are generating one document and wraps the mapping output
with "``---``" and trailing "``...``".
-The WrapColumn parameter will cause the flow mappings and sequences to
+The ``WrapColumn`` parameter will cause the flow mappings and sequences to
line-wrap when they go over the supplied column. Pass 0 to completely
suppress the wrapping.
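+
+For example, to suppress wrapping when no context is needed:
+
+.. code-block:: c++
+
+    Output yout(llvm::outs(), /*Ctxt=*/nullptr, /*WrapColumn=*/0);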
@@ -980,7 +980,7 @@ The above could produce output like:
...
On the other hand, if the top level data structure you are streaming as YAML
-has a DocumentListTraits specialization, then Output walks through each element
+has a ``DocumentListTraits`` specialization, then Output walks through each element
of your DocumentList and generates a "``---``" before the start of each element
and ends with a "``...``".
@@ -1008,9 +1008,9 @@ The above could produce output like:
Input
=====
-The llvm::yaml::Input class is used to parse YAML document(s) into your native
+The ``llvm::yaml::Input`` class is used to parse YAML document(s) into your native
data structures. To instantiate an Input
-object you need a StringRef to the entire YAML file, and optionally a context
+object, you need a ``StringRef`` to the entire YAML file, and optionally a context
pointer:
.. code-block:: c++
@@ -1024,7 +1024,7 @@ the document(s). If you expect there might be multiple YAML documents in
one file, you'll need to specialize ``DocumentListTraits`` on a list of your
document type and stream in that document list type. Otherwise you can
just stream in the document type. Also, you can check if there were
-any syntax errors in the YAML be calling the error() method on the Input
+any syntax errors in the YAML by calling the ``error()`` method on the Input
object. For example:
.. code-block:: c++
diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst
index 5660802..5ebff3b 100644
--- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst
+++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst
@@ -86,7 +86,7 @@ instead of computing "``x+3``" twice.
Unfortunately, no amount of local analysis will be able to detect and
correct this. This requires two transformations: reassociation of
-expressions (to make the add's lexically identical) and Common
+expressions (to make the adds lexically identical) and Common
Subexpression Elimination (CSE) to delete the redundant add instruction.
Fortunately, LLVM provides a broad range of optimizations that you can
use, in the form of "passes".
diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h
index ff8bdb8..fb91690 100644
--- a/llvm/include/llvm/ADT/ArrayRef.h
+++ b/llvm/include/llvm/ADT/ArrayRef.h
@@ -317,10 +317,6 @@ namespace llvm {
/// Construct an empty MutableArrayRef.
/*implicit*/ MutableArrayRef() = default;
- /// Construct an empty MutableArrayRef from std::nullopt.
- /*implicit*/ LLVM_DEPRECATED("Use {} or MutableArrayRef<T>() instead", "{}")
- MutableArrayRef(std::nullopt_t) : ArrayRef<T>() {}
-
/// Construct a MutableArrayRef from a single element.
/*implicit*/ MutableArrayRef(T &OneElt) : ArrayRef<T>(OneElt) {}
diff --git a/llvm/include/llvm/ADT/DenseMapInfo.h b/llvm/include/llvm/ADT/DenseMapInfo.h
index b850223..9d8fd89 100644
--- a/llvm/include/llvm/ADT/DenseMapInfo.h
+++ b/llvm/include/llvm/ADT/DenseMapInfo.h
@@ -51,10 +51,10 @@ inline unsigned combineHashValue(unsigned a, unsigned b) {
/// just be `void`.
template<typename T, typename Enable = void>
struct DenseMapInfo {
- //static inline T getEmptyKey();
- //static inline T getTombstoneKey();
- //static unsigned getHashValue(const T &Val);
- //static bool isEqual(const T &LHS, const T &RHS);
+ // static constexpr T getEmptyKey();
+ // static constexpr T getTombstoneKey();
+ // static unsigned getHashValue(const T &Val);
+ // static bool isEqual(const T &LHS, const T &RHS);
};
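+
+// For example, a hypothetical specialization for a simple wrapper type:
+//
+//   struct MyKey { unsigned V; };
+//   template <> struct DenseMapInfo<MyKey> {
+//     static constexpr MyKey getEmptyKey() { return {~0U}; }
+//     static constexpr MyKey getTombstoneKey() { return {~0U - 1}; }
+//     static unsigned getHashValue(const MyKey &K) { return K.V * 37U; }
+//     static bool isEqual(const MyKey &L, const MyKey &R) {
+//       return L.V == R.V;
+//     }
+//   };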
// Provide DenseMapInfo for all pointers. Come up with sentinel pointer values
@@ -70,13 +70,13 @@ struct DenseMapInfo<T*> {
// "Log2MaxAlign bits of alignment");
static constexpr uintptr_t Log2MaxAlign = 12;
- static inline T* getEmptyKey() {
+ static constexpr T *getEmptyKey() {
uintptr_t Val = static_cast<uintptr_t>(-1);
Val <<= Log2MaxAlign;
return reinterpret_cast<T*>(Val);
}
- static inline T* getTombstoneKey() {
+ static constexpr T *getTombstoneKey() {
uintptr_t Val = static_cast<uintptr_t>(-2);
Val <<= Log2MaxAlign;
return reinterpret_cast<T*>(Val);
@@ -92,8 +92,8 @@ struct DenseMapInfo<T*> {
// Provide DenseMapInfo for chars.
template<> struct DenseMapInfo<char> {
- static inline char getEmptyKey() { return ~0; }
- static inline char getTombstoneKey() { return ~0 - 1; }
+ static constexpr char getEmptyKey() { return ~0; }
+ static constexpr char getTombstoneKey() { return ~0 - 1; }
static unsigned getHashValue(const char& Val) { return Val * 37U; }
static bool isEqual(const char &LHS, const char &RHS) {
@@ -103,8 +103,8 @@ template<> struct DenseMapInfo<char> {
// Provide DenseMapInfo for unsigned chars.
template <> struct DenseMapInfo<unsigned char> {
- static inline unsigned char getEmptyKey() { return ~0; }
- static inline unsigned char getTombstoneKey() { return ~0 - 1; }
+ static constexpr unsigned char getEmptyKey() { return ~0; }
+ static constexpr unsigned char getTombstoneKey() { return ~0 - 1; }
static unsigned getHashValue(const unsigned char &Val) { return Val * 37U; }
static bool isEqual(const unsigned char &LHS, const unsigned char &RHS) {
@@ -114,8 +114,8 @@ template <> struct DenseMapInfo<unsigned char> {
// Provide DenseMapInfo for unsigned shorts.
template <> struct DenseMapInfo<unsigned short> {
- static inline unsigned short getEmptyKey() { return 0xFFFF; }
- static inline unsigned short getTombstoneKey() { return 0xFFFF - 1; }
+ static constexpr unsigned short getEmptyKey() { return 0xFFFF; }
+ static constexpr unsigned short getTombstoneKey() { return 0xFFFF - 1; }
static unsigned getHashValue(const unsigned short &Val) { return Val * 37U; }
static bool isEqual(const unsigned short &LHS, const unsigned short &RHS) {
@@ -125,8 +125,8 @@ template <> struct DenseMapInfo<unsigned short> {
// Provide DenseMapInfo for unsigned ints.
template<> struct DenseMapInfo<unsigned> {
- static inline unsigned getEmptyKey() { return ~0U; }
- static inline unsigned getTombstoneKey() { return ~0U - 1; }
+ static constexpr unsigned getEmptyKey() { return ~0U; }
+ static constexpr unsigned getTombstoneKey() { return ~0U - 1; }
static unsigned getHashValue(const unsigned& Val) { return Val * 37U; }
static bool isEqual(const unsigned& LHS, const unsigned& RHS) {
@@ -136,8 +136,8 @@ template<> struct DenseMapInfo<unsigned> {
// Provide DenseMapInfo for unsigned longs.
template<> struct DenseMapInfo<unsigned long> {
- static inline unsigned long getEmptyKey() { return ~0UL; }
- static inline unsigned long getTombstoneKey() { return ~0UL - 1L; }
+ static constexpr unsigned long getEmptyKey() { return ~0UL; }
+ static constexpr unsigned long getTombstoneKey() { return ~0UL - 1L; }
static unsigned getHashValue(const unsigned long& Val) {
if constexpr (sizeof(Val) == 4)
@@ -153,8 +153,8 @@ template<> struct DenseMapInfo<unsigned long> {
// Provide DenseMapInfo for unsigned long longs.
template<> struct DenseMapInfo<unsigned long long> {
- static inline unsigned long long getEmptyKey() { return ~0ULL; }
- static inline unsigned long long getTombstoneKey() { return ~0ULL - 1ULL; }
+ static constexpr unsigned long long getEmptyKey() { return ~0ULL; }
+ static constexpr unsigned long long getTombstoneKey() { return ~0ULL - 1ULL; }
static unsigned getHashValue(const unsigned long long& Val) {
return densemap::detail::mix(Val);
@@ -168,16 +168,16 @@ template<> struct DenseMapInfo<unsigned long long> {
// Provide DenseMapInfo for shorts.
template <> struct DenseMapInfo<short> {
- static inline short getEmptyKey() { return 0x7FFF; }
- static inline short getTombstoneKey() { return -0x7FFF - 1; }
+ static constexpr short getEmptyKey() { return 0x7FFF; }
+ static constexpr short getTombstoneKey() { return -0x7FFF - 1; }
static unsigned getHashValue(const short &Val) { return Val * 37U; }
static bool isEqual(const short &LHS, const short &RHS) { return LHS == RHS; }
};
// Provide DenseMapInfo for ints.
template<> struct DenseMapInfo<int> {
- static inline int getEmptyKey() { return 0x7fffffff; }
- static inline int getTombstoneKey() { return -0x7fffffff - 1; }
+ static constexpr int getEmptyKey() { return 0x7fffffff; }
+ static constexpr int getTombstoneKey() { return -0x7fffffff - 1; }
static unsigned getHashValue(const int& Val) { return (unsigned)(Val * 37U); }
static bool isEqual(const int& LHS, const int& RHS) {
@@ -187,11 +187,11 @@ template<> struct DenseMapInfo<int> {
// Provide DenseMapInfo for longs.
template<> struct DenseMapInfo<long> {
- static inline long getEmptyKey() {
+ static constexpr long getEmptyKey() {
return (1UL << (sizeof(long) * 8 - 1)) - 1UL;
}
- static inline long getTombstoneKey() { return getEmptyKey() - 1L; }
+ static constexpr long getTombstoneKey() { return getEmptyKey() - 1L; }
static unsigned getHashValue(const long& Val) {
return (unsigned)(Val * 37UL);
@@ -204,8 +204,10 @@ template<> struct DenseMapInfo<long> {
// Provide DenseMapInfo for long longs.
template<> struct DenseMapInfo<long long> {
- static inline long long getEmptyKey() { return 0x7fffffffffffffffLL; }
- static inline long long getTombstoneKey() { return -0x7fffffffffffffffLL-1; }
+ static constexpr long long getEmptyKey() { return 0x7fffffffffffffffLL; }
+ static constexpr long long getTombstoneKey() {
+ return -0x7fffffffffffffffLL - 1;
+ }
static unsigned getHashValue(const long long& Val) {
return (unsigned)(Val * 37ULL);
@@ -224,12 +226,12 @@ struct DenseMapInfo<std::pair<T, U>> {
using FirstInfo = DenseMapInfo<T>;
using SecondInfo = DenseMapInfo<U>;
- static inline Pair getEmptyKey() {
+ static constexpr Pair getEmptyKey() {
return std::make_pair(FirstInfo::getEmptyKey(),
SecondInfo::getEmptyKey());
}
- static inline Pair getTombstoneKey() {
+ static constexpr Pair getTombstoneKey() {
return std::make_pair(FirstInfo::getTombstoneKey(),
SecondInfo::getTombstoneKey());
}
@@ -257,11 +259,11 @@ struct DenseMapInfo<std::pair<T, U>> {
template <typename... Ts> struct DenseMapInfo<std::tuple<Ts...>> {
using Tuple = std::tuple<Ts...>;
- static inline Tuple getEmptyKey() {
+ static constexpr Tuple getEmptyKey() {
return Tuple(DenseMapInfo<Ts>::getEmptyKey()...);
}
- static inline Tuple getTombstoneKey() {
+ static constexpr Tuple getTombstoneKey() {
return Tuple(DenseMapInfo<Ts>::getTombstoneKey()...);
}
@@ -309,10 +311,22 @@ struct DenseMapInfo<Enum, std::enable_if_t<std::is_enum_v<Enum>>> {
using UnderlyingType = std::underlying_type_t<Enum>;
using Info = DenseMapInfo<UnderlyingType>;
- static Enum getEmptyKey() { return static_cast<Enum>(Info::getEmptyKey()); }
+ // If an enum does not have a "fixed" underlying type, it may be UB to cast
+ // some values of the underlying type to the enum. We use an "extra" constexpr
+ // local to ensure that such UB would trigger "static assertion expression is
+ // not an integral constant expression", rather than runtime UB.
+ //
+  // If you hit this error, you can fix it by switching to `enum class`, or adding
+ // an explicit underlying type (e.g. `enum X : int`) to the enum's definition.
+
+ static constexpr Enum getEmptyKey() {
+ constexpr Enum V = static_cast<Enum>(Info::getEmptyKey());
+ return V;
+ }
- static Enum getTombstoneKey() {
- return static_cast<Enum>(Info::getTombstoneKey());
+ static constexpr Enum getTombstoneKey() {
+ constexpr Enum V = static_cast<Enum>(Info::getTombstoneKey());
+ return V;
}
static unsigned getHashValue(const Enum &Val) {
@@ -326,9 +340,11 @@ template <typename T> struct DenseMapInfo<std::optional<T>> {
using Optional = std::optional<T>;
using Info = DenseMapInfo<T>;
- static inline Optional getEmptyKey() { return {Info::getEmptyKey()}; }
+ static constexpr Optional getEmptyKey() { return {Info::getEmptyKey()}; }
- static inline Optional getTombstoneKey() { return {Info::getTombstoneKey()}; }
+ static constexpr Optional getTombstoneKey() {
+ return {Info::getTombstoneKey()};
+ }
static unsigned getHashValue(const Optional &OptionalVal) {
return detail::combineHashValue(
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 73bfe1a..af6e534 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -236,8 +236,8 @@ public:
/// In some cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
- bool shouldRetryWithRuntimeCheck() const {
- return FoundNonConstantDistanceDependence &&
+ bool shouldRetryWithRuntimeChecks() const {
+ return ShouldRetryWithRuntimeChecks &&
Status == VectorizationSafetyStatus::PossiblySafeWithRtChecks;
}
@@ -327,9 +327,9 @@ private:
uint64_t MaxStoreLoadForwardSafeDistanceInBits =
std::numeric_limits<uint64_t>::max();
- /// If we see a non-constant dependence distance we can still try to
- /// vectorize this loop with runtime checks.
- bool FoundNonConstantDistanceDependence = false;
+ /// Whether we should try to vectorize the loop with runtime checks, if the
+ /// dependencies are not safe.
+ bool ShouldRetryWithRuntimeChecks = false;
/// Result of the dependence checks, indicating whether the checked
/// dependences are safe for vectorization, require RT checks or are known to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 98b793a..7928835 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1930,7 +1930,7 @@ public:
/// Returns a bitmask constructed from the target-features or fmv-features
/// metadata of a function.
- LLVM_ABI uint64_t getFeatureMask(const Function &F) const;
+ LLVM_ABI APInt getFeatureMask(const Function &F) const;
/// Returns true if this is an instance of a function with multiple versions.
LLVM_ABI bool isMultiversionedFunction(const Function &F) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index ddc8a5e..2ea87b3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1126,7 +1126,9 @@ public:
virtual bool hasArmWideBranch(bool) const { return false; }
- virtual uint64_t getFeatureMask(const Function &F) const { return 0; }
+ virtual APInt getFeatureMask(const Function &F) const {
+ return APInt::getZero(32);
+ }
virtual bool isMultiversionedFunction(const Function &F) const {
return false;
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index c7e4bdf..a2311d2 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -181,6 +181,7 @@ enum Kind {
kw_amdgpu_cs_chain_preserve,
kw_amdgpu_kernel,
kw_amdgpu_gfx,
+ kw_amdgpu_gfx_whole_wave,
kw_tailcc,
kw_m68k_rtdcc,
kw_graalcc,
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index e4f82ad..ad35d7f 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -362,6 +362,7 @@ enum {
ELFOSABI_FENIXOS = 16, // FenixOS
ELFOSABI_CLOUDABI = 17, // Nuxi CloudABI
ELFOSABI_CUDA = 51, // NVIDIA CUDA architecture.
+ ELFOSABI_CUDA_V2 = 41, // NVIDIA CUDA architecture.
ELFOSABI_FIRST_ARCH = 64, // First architecture-specific OS ABI
ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime
ELFOSABI_AMDGPU_PAL = 65, // AMD PAL runtime
@@ -385,6 +386,12 @@ enum {
ELFABIVERSION_AMDGPU_HSA_V6 = 4,
};
+// CUDA OS ABI Version identification.
+enum {
+ ELFABIVERSION_CUDA_V1 = 7,
+ ELFABIVERSION_CUDA_V2 = 8,
+};
+
#define ELF_RELOC(name, value) name = value,
// X86_64 relocations.
@@ -921,7 +928,7 @@ enum {
// NVPTX specific e_flags.
enum : unsigned {
- // Processor selection mask for EF_CUDA_SM* values.
+  // Processor selection mask for EF_CUDA_SM* values prior to Blackwell.
EF_CUDA_SM = 0xff,
// SM based processor values.
@@ -954,12 +961,22 @@ enum : unsigned {
// The target is using 64-bit addressing.
EF_CUDA_64BIT_ADDRESS = 0x400,
// Set when using the sm_90a processor.
- EF_CUDA_ACCELERATORS = 0x800,
+ EF_CUDA_ACCELERATORS_V1 = 0x800,
// Undocumented software feature.
EF_CUDA_SW_FLAG_V2 = 0x1000,
// Virtual processor selection mask for EF_CUDA_VIRTUAL_SM* values.
EF_CUDA_VIRTUAL_SM = 0xff0000,
+
+  // Processor selection mask for EF_CUDA_SM* values following Blackwell.
+ EF_CUDA_SM_MASK = 0xff00,
+
+ // SM based processor values.
+ EF_CUDA_SM100 = 0x6400,
+ EF_CUDA_SM120 = 0x7800,
+
+ // Set when using an accelerator variant like sm_100a.
+ EF_CUDA_ACCELERATORS = 0x8,
};
// ELF Relocation types for BPF
diff --git a/llvm/include/llvm/BinaryFormat/SFrame.h b/llvm/include/llvm/BinaryFormat/SFrame.h
index 16d3b16..98dbe38 100644
--- a/llvm/include/llvm/BinaryFormat/SFrame.h
+++ b/llvm/include/llvm/BinaryFormat/SFrame.h
@@ -15,33 +15,36 @@
#ifndef LLVM_BINARYFORMAT_SFRAME_H
#define LLVM_BINARYFORMAT_SFRAME_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Endian.h"
-namespace llvm::sframe {
+namespace llvm {
+
+template <typename T> struct EnumEntry;
+
+namespace sframe {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
constexpr uint16_t Magic = 0xdee2;
enum class Version : uint8_t {
- V1 = 1,
- V2 = 2,
+#define HANDLE_SFRAME_VERSION(CODE, NAME) NAME = CODE,
+#include "llvm/BinaryFormat/SFrameConstants.def"
};
enum class Flags : uint8_t {
- FDESorted = 0x01,
- FramePointer = 0x02,
- FDEFuncStartPCRel = 0x04,
+#define HANDLE_SFRAME_FLAG(CODE, NAME) NAME = CODE,
+#include "llvm/BinaryFormat/SFrameConstants.def"
V2AllFlags = FDESorted | FramePointer | FDEFuncStartPCRel,
LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/0xff),
};
enum class ABI : uint8_t {
- AArch64EndianBig = 1,
- AArch64EndianLittle = 2,
- AMD64EndianLittle = 3,
+#define HANDLE_SFRAME_ABI(CODE, NAME) NAME = CODE,
+#include "llvm/BinaryFormat/SFrameConstants.def"
};
/// SFrame FRE Types. Bits 0-3 of FuncDescEntry.Info.
@@ -160,6 +163,11 @@ template <endianness E> using FrameRowEntryAddr1 = FrameRowEntry<uint8_t, E>;
template <endianness E> using FrameRowEntryAddr2 = FrameRowEntry<uint16_t, E>;
template <endianness E> using FrameRowEntryAddr4 = FrameRowEntry<uint32_t, E>;
-} // namespace llvm::sframe
+ArrayRef<EnumEntry<Version>> getVersions();
+ArrayRef<EnumEntry<Flags>> getFlags();
+ArrayRef<EnumEntry<ABI>> getABIs();
+
+} // namespace sframe
+} // namespace llvm
#endif // LLVM_BINARYFORMAT_SFRAME_H
diff --git a/llvm/include/llvm/BinaryFormat/SFrameConstants.def b/llvm/include/llvm/BinaryFormat/SFrameConstants.def
new file mode 100644
index 0000000..643b15f
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/SFrameConstants.def
@@ -0,0 +1,39 @@
+//===- SFrameConstants.def --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if !(defined(HANDLE_SFRAME_VERSION) || defined(HANDLE_SFRAME_FLAG) || \
+ defined(HANDLE_SFRAME_ABI))
+#error "Missing HANDLE_SFRAME definition"
+#endif
+
+#ifndef HANDLE_SFRAME_VERSION
+#define HANDLE_SFRAME_VERSION(CODE, NAME)
+#endif
+
+#ifndef HANDLE_SFRAME_FLAG
+#define HANDLE_SFRAME_FLAG(CODE, NAME)
+#endif
+
+#ifndef HANDLE_SFRAME_ABI
+#define HANDLE_SFRAME_ABI(CODE, NAME)
+#endif
+
+HANDLE_SFRAME_VERSION(0x01, V1)
+HANDLE_SFRAME_VERSION(0x02, V2)
+
+HANDLE_SFRAME_FLAG(0x01, FDESorted)
+HANDLE_SFRAME_FLAG(0x02, FramePointer)
+HANDLE_SFRAME_FLAG(0x04, FDEFuncStartPCRel)
+
+HANDLE_SFRAME_ABI(0x01, AArch64EndianBig)
+HANDLE_SFRAME_ABI(0x02, AArch64EndianLittle)
+HANDLE_SFRAME_ABI(0x03, AMD64EndianLittle)
+
+#undef HANDLE_SFRAME_VERSION
+#undef HANDLE_SFRAME_FLAG
+#undef HANDLE_SFRAME_ABI
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 1d7c414..1fcedcd 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1985,11 +1985,6 @@ public:
cast<VectorType>(Args[0]->getType()), {}, CostKind, Index,
cast<VectorType>(Args[1]->getType()));
}
- case Intrinsic::vector_reverse: {
- return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
- cast<VectorType>(Args[0]->getType()), {},
- CostKind, 0, cast<VectorType>(RetTy));
- }
case Intrinsic::vector_splice: {
unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
return thisT()->getShuffleCost(TTI::SK_Splice, cast<VectorType>(RetTy),
@@ -2458,6 +2453,10 @@ public:
thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
return Cost;
}
+ case Intrinsic::vector_reverse:
+ return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
+ cast<VectorType>(ICA.getArgTypes()[0]), {},
+ CostKind, 0, cast<VectorType>(RetTy));
case Intrinsic::get_active_lane_mask: {
Type *ArgTy = ICA.getArgTypes()[0];
EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
index da73238..490d1a3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
@@ -103,6 +103,20 @@ public:
/// \return The known alignment for the pointer-like value \p R.
Align computeKnownAlignment(Register R, unsigned Depth = 0);
+ /// If a G_SHL/G_ASHR/G_LSHR node with shift operand \p R has shift amounts
+ /// that are all less than the element bit-width of the shift node, return the
+ /// valid constant range.
+ std::optional<ConstantRange>
+ getValidShiftAmountRange(Register R, const APInt &DemandedElts,
+ unsigned Depth);
+
+ /// If a G_SHL/G_ASHR/G_LSHR node with shift operand \p R has shift amounts
+ /// that are all less than the element bit-width of the shift node, return the
+ /// minimum possible value.
+ std::optional<uint64_t> getValidMinimumShiftAmount(Register R,
+ const APInt &DemandedElts,
+ unsigned Depth = 0);
+
/// Determine which floating-point classes are valid for \p V, and return them
/// in KnownFPClass bit sets.
///
diff --git a/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h b/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
index c22f9d4..c70413d 100644
--- a/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
+++ b/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
@@ -15,19 +15,17 @@
#define LLVM_CODEGEN_LINKALLASMWRITERCOMPONENTS_H
#include "llvm/IR/BuiltinGCs.h"
-#include <cstdlib>
+#include "llvm/Support/AlwaysTrue.h"
namespace {
struct ForceAsmWriterLinking {
ForceAsmWriterLinking() {
// We must reference the plug-ins in such a way that compilers will not
// delete it all as dead code, even with whole program optimization,
- // yet is effectively a NO-OP. As the compiler isn't smart enough
- // to know that getenv() never returns -1, this will do the job.
- // This is so that globals in the translation units where these functions
- // are defined are forced to be initialized, populating various
- // registries.
- if (std::getenv("bar") != (char*) -1)
+ // yet is effectively a NO-OP. This is so that globals in the translation
+ // units where these functions are defined are forced to be initialized,
+ // populating various registries.
+ if (llvm::getNonFoldableAlwaysTrue())
return;
llvm::linkOcamlGCPrinter();
diff --git a/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h b/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h
index 6f56682..f0a01d2 100644
--- a/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h
+++ b/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h
@@ -16,20 +16,18 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/AlwaysTrue.h"
#include "llvm/Target/TargetMachine.h"
-#include <cstdlib>
namespace {
struct ForceCodegenLinking {
ForceCodegenLinking() {
// We must reference the passes in such a way that compilers will not
// delete it all as dead code, even with whole program optimization,
- // yet is effectively a NO-OP. As the compiler isn't smart enough
- // to know that getenv() never returns -1, this will do the job.
- // This is so that globals in the translation units where these functions
- // are defined are forced to be initialized, populating various
- // registries.
- if (std::getenv("bar") != (char*) -1)
+ // yet is effectively a NO-OP. This is so that globals in the translation
+ // units where these functions are defined are forced to be initialized,
+ // populating various registries.
+ if (llvm::getNonFoldableAlwaysTrue())
return;
(void) llvm::createFastRegisterAllocator();
diff --git a/llvm/include/llvm/CodeGen/MachineInstrBundle.h b/llvm/include/llvm/CodeGen/MachineInstrBundle.h
index d324236..ebf7534 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBundle.h
@@ -15,6 +15,7 @@
#define LLVM_CODEGEN_MACHINEINSTRBUNDLE_H
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -294,6 +295,12 @@ LLVM_ABI PhysRegInfo AnalyzePhysRegInBundle(const MachineInstr &MI,
Register Reg,
const TargetRegisterInfo *TRI);
+class FinalizeBundleTestPass : public PassInfoMixin<FinalizeBundleTestPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
} // End llvm namespace
#endif
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index e7a7091..efda7eb 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -65,7 +65,7 @@
//
// void <SubTarget>Subtarget::
// overrideSchedPolicy(MachineSchedPolicy &Policy,
-// unsigned NumRegionInstrs) const {
+// const SchedRegion &Region) const {
// Policy.<Flag> = true;
// }
//
@@ -218,6 +218,22 @@ struct MachineSchedPolicy {
MachineSchedPolicy() = default;
};
+/// A region of an MBB for scheduling.
+struct SchedRegion {
+ /// RegionBegin is the first instruction in the scheduling region, and
+ /// RegionEnd is either MBB->end() or the scheduling boundary after the
+ /// last instruction in the scheduling region. These iterators cannot refer
+ /// to instructions outside of the identified scheduling region because
+ /// those may be reordered before scheduling this region.
+ MachineBasicBlock::iterator RegionBegin;
+ MachineBasicBlock::iterator RegionEnd;
+ unsigned NumRegionInstrs;
+
+ SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
+ unsigned N)
+ : RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
+};
+
/// MachineSchedStrategy - Interface to the scheduling algorithm used by
/// ScheduleDAGMI.
///
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 714285e..095a40e 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -438,10 +438,6 @@ LLVM_ABI extern char &UnpackMachineBundlesID;
LLVM_ABI FunctionPass *
createUnpackMachineBundles(std::function<bool(const MachineFunction &)> Ftor);
-/// FinalizeMachineBundles - This pass finalize machine instruction
-/// bundles (created earlier, e.g. during pre-RA scheduling).
-LLVM_ABI extern char &FinalizeMachineBundlesID;
-
/// StackMapLiveness - This pass analyses the register live-out set of
/// stackmap/patchpoint intrinsics and attaches the calculated information to
/// the intrinsic for later emission to the StackMap.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 657951d..eac8e14 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1202,13 +1202,16 @@ public:
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops, const SDNodeFlags Flags);
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL,
- ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops);
+ ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops,
+ const SDNodeFlags Flags);
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops, const SDNodeFlags Flags);
// Use flags from current flag inserter.
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops);
+ LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL,
+ ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops);
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops);
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
@@ -1346,9 +1349,10 @@ public:
/// Helper function to make it easier to build SelectCC's if you just have an
/// ISD::CondCode instead of an SDValue.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
- SDValue False, ISD::CondCode Cond) {
+ SDValue False, ISD::CondCode Cond,
+ SDNodeFlags Flags = SDNodeFlags()) {
return getNode(ISD::SELECT_CC, DL, True.getValueType(), LHS, RHS, True,
- False, getCondCode(Cond));
+ False, getCondCode(Cond), Flags);
}
/// Try to simplify a select/vselect into 1 of its operands or a constant.
@@ -1425,10 +1429,9 @@ public:
/// Creates a LifetimeSDNode that starts (`IsStart==true`) or ends
/// (`IsStart==false`) the lifetime of the portion of `FrameIndex` between
- /// offsets `Offset` and `Offset + Size`.
+ /// offsets `0` and `Size`.
LLVM_ABI SDValue getLifetimeNode(bool IsStart, const SDLoc &dl, SDValue Chain,
- int FrameIndex, int64_t Size,
- int64_t Offset = -1);
+ int FrameIndex, int64_t Size);
/// Creates a PseudoProbeSDNode with function GUID `Guid` and
/// the index of the block `Index` it is probing, as well as the attributes
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 5d9937f..8e9c1f7 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2004,25 +2004,17 @@ public:
class LifetimeSDNode : public SDNode {
friend class SelectionDAG;
int64_t Size;
- int64_t Offset; // -1 if offset is unknown.
LifetimeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
- SDVTList VTs, int64_t Size, int64_t Offset)
- : SDNode(Opcode, Order, dl, VTs), Size(Size), Offset(Offset) {}
+ SDVTList VTs, int64_t Size)
+ : SDNode(Opcode, Order, dl, VTs), Size(Size) {}
+
public:
int64_t getFrameIndex() const {
return cast<FrameIndexSDNode>(getOperand(1))->getIndex();
}
- bool hasOffset() const { return Offset >= 0; }
- int64_t getOffset() const {
- assert(hasOffset() && "offset is unknown");
- return Offset;
- }
- int64_t getSize() const {
- assert(hasOffset() && "offset is unknown");
- return Size;
- }
+ int64_t getSize() const { return Size; }
// Methods to support isa and dyn_cast
static bool classof(const SDNode *N) {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 1a548a5..cbdc1b6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3219,25 +3219,19 @@ public:
/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
- /// \p SI is the vector store instruction.
+  /// \p Store is the vector store instruction. Can be either a plain store
+  /// or a vp.store.
+  /// \p Mask is a per-segment mask (i.e. with a number of lanes equal to
+  /// that of one component being interleaved). Can be nullptr, in which case
+  /// the result is unconditional.
/// \p SVI is the shufflevector to RE-interleave the stored vector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ virtual bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const {
return false;
}
- /// Lower an interleaved store to target specific intrinsics. Return
- /// true on success.
- ///
- /// \p Store is the vp.store instruction.
- /// \p Mask is a mask value
- /// \p InterleaveOps is a list of values being interleaved.
- virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
- ArrayRef<Value *> InterleaveOps) const {
- return false;
- }
-
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
/// llvm.vector.deinterleave{2,3,5,7}
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 45e67d8..a8c7a8a 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -54,6 +54,7 @@ class TargetRegisterClass;
class TargetRegisterInfo;
class TargetSchedModel;
class Triple;
+struct SchedRegion;
//===----------------------------------------------------------------------===//
///
@@ -231,7 +232,7 @@ public:
/// scheduling heuristics (no custom MachineSchedStrategy) to make
/// changes to the generic scheduling policy.
virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {}
+ const SchedRegion &Region) const {}
/// Override generic post-ra scheduling policy within a region.
///
@@ -241,7 +242,7 @@ public:
/// Note that some options like tracking register pressure won't take effect
/// in post-ra scheduling.
virtual void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {}
+ const SchedRegion &Region) const {}
// Perform target-specific adjustments to the latency of a schedule
// dependency.
diff --git a/llvm/include/llvm/Config/abi-breaking.h.cmake b/llvm/include/llvm/Config/abi-breaking.h.cmake
index 2d27e02..330f360 100644
--- a/llvm/include/llvm/Config/abi-breaking.h.cmake
+++ b/llvm/include/llvm/Config/abi-breaking.h.cmake
@@ -12,12 +12,41 @@
#ifndef LLVM_ABI_BREAKING_CHECKS_H
#define LLVM_ABI_BREAKING_CHECKS_H
+// llvm-config.h is required for LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS
+#include "llvm/Config/llvm-config.h"
+
/* Define to enable checks that alter the LLVM C++ ABI */
#cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS
/* Define to enable reverse iteration of unordered llvm containers */
#cmakedefine01 LLVM_ENABLE_REVERSE_ITERATION
+#if !defined(__has_attribute)
+#define __has_attribute(attribute) 0
+#endif
+
+// Properly annotate EnableABIBreakingChecks or DisableABIBreakingChecks for
+// export from a shared library.
+// TODO(https://github.com/llvm/llvm-project/issues/145406): eliminate need for
+// two preprocessor definitions to gate LLVM_ABI macro definitions.
+#if defined(LLVM_BUILD_STATIC) || !defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS)
+#define ABI_BREAKING_EXPORT_ABI
+#else
+#if defined(_WIN32)
+#if defined(LLVM_EXPORTS)
+#define ABI_BREAKING_EXPORT_ABI __declspec(dllexport)
+#else
+#define ABI_BREAKING_EXPORT_ABI __declspec(dllimport)
+#endif
+#else
+#if __has_attribute(visibility)
+#define ABI_BREAKING_EXPORT_ABI __attribute__((__visibility__("default")))
+#else
+#define ABI_BREAKING_EXPORT_ABI
+#endif
+#endif
+#endif
+
/* Allow selectively disabling link-time mismatch checking so that header-only
ADT content from LLVM can be used without linking libSupport. */
#if !defined(LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING) || !LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
@@ -43,12 +72,12 @@
#endif
namespace llvm {
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
-extern int EnableABIBreakingChecks;
+ABI_BREAKING_EXPORT_ABI extern int EnableABIBreakingChecks;
LLVM_HIDDEN_VISIBILITY
__attribute__((weak)) int *VerifyEnableABIBreakingChecks =
&EnableABIBreakingChecks;
#else
-extern int DisableABIBreakingChecks;
+ABI_BREAKING_EXPORT_ABI extern int DisableABIBreakingChecks;
LLVM_HIDDEN_VISIBILITY
__attribute__((weak)) int *VerifyDisableABIBreakingChecks =
&DisableABIBreakingChecks;
diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h
index 21e7457..d9b08b2 100644
--- a/llvm/include/llvm/Demangle/Demangle.h
+++ b/llvm/include/llvm/Demangle/Demangle.h
@@ -9,6 +9,7 @@
#ifndef LLVM_DEMANGLE_DEMANGLE_H
#define LLVM_DEMANGLE_DEMANGLE_H
+#include "DemangleConfig.h"
#include <cstddef>
#include <optional>
#include <string>
@@ -33,7 +34,8 @@ enum : int {
/// Returns a non-NULL pointer to a NUL-terminated C style string
/// that should be explicitly freed, if successful. Otherwise, may return
/// nullptr if mangled_name is not a valid mangling or is nullptr.
-char *itaniumDemangle(std::string_view mangled_name, bool ParseParams = true);
+DEMANGLE_ABI char *itaniumDemangle(std::string_view mangled_name,
+ bool ParseParams = true);
enum MSDemangleFlags {
MSDF_None = 0,
@@ -52,87 +54,90 @@ enum MSDemangleFlags {
/// bytes of the input string were consumed.
/// status receives one of the demangle_ enum entries above if it's not nullptr.
/// Flags controls various details of the demangled representation.
-char *microsoftDemangle(std::string_view mangled_name, size_t *n_read,
- int *status, MSDemangleFlags Flags = MSDF_None);
+DEMANGLE_ABI char *microsoftDemangle(std::string_view mangled_name,
+ size_t *n_read, int *status,
+ MSDemangleFlags Flags = MSDF_None);
-std::optional<size_t>
+DEMANGLE_ABI std::optional<size_t>
getArm64ECInsertionPointInMangledName(std::string_view MangledName);
// Demangles a Rust v0 mangled symbol.
-char *rustDemangle(std::string_view MangledName);
+DEMANGLE_ABI char *rustDemangle(std::string_view MangledName);
// Demangles a D mangled symbol.
-char *dlangDemangle(std::string_view MangledName);
+DEMANGLE_ABI char *dlangDemangle(std::string_view MangledName);
/// Attempt to demangle a string using different demangling schemes.
/// The function uses heuristics to determine which demangling scheme to use.
/// \param MangledName - reference to string to demangle.
/// \returns - the demangled string, or a copy of the input string if no
/// demangling occurred.
-std::string demangle(std::string_view MangledName);
+DEMANGLE_ABI std::string demangle(std::string_view MangledName);
-bool nonMicrosoftDemangle(std::string_view MangledName, std::string &Result,
- bool CanHaveLeadingDot = true,
- bool ParseParams = true);
+DEMANGLE_ABI bool nonMicrosoftDemangle(std::string_view MangledName,
+ std::string &Result,
+ bool CanHaveLeadingDot = true,
+ bool ParseParams = true);
/// "Partial" demangler. This supports demangling a string into an AST
/// (typically an intermediate stage in itaniumDemangle) and querying certain
/// properties or partially printing the demangled name.
struct ItaniumPartialDemangler {
- ItaniumPartialDemangler();
+ DEMANGLE_ABI ItaniumPartialDemangler();
- ItaniumPartialDemangler(ItaniumPartialDemangler &&Other);
- ItaniumPartialDemangler &operator=(ItaniumPartialDemangler &&Other);
+ DEMANGLE_ABI ItaniumPartialDemangler(ItaniumPartialDemangler &&Other);
+ DEMANGLE_ABI ItaniumPartialDemangler &
+ operator=(ItaniumPartialDemangler &&Other);
/// Demangle into an AST. Subsequent calls to the rest of the member functions
/// implicitly operate on the AST this produces.
/// \return true on error, false otherwise
- bool partialDemangle(const char *MangledName);
+ DEMANGLE_ABI bool partialDemangle(const char *MangledName);
/// Just print the entire mangled name into Buf. Buf and N behave like the
/// second and third parameters to __cxa_demangle.
- char *finishDemangle(char *Buf, size_t *N) const;
+ DEMANGLE_ABI char *finishDemangle(char *Buf, size_t *N) const;
/// See \ref finishDemangle
///
/// \param[in] OB A llvm::itanium_demangle::OutputBuffer that the demangled
/// name will be printed into.
///
- char *finishDemangle(void *OB) const;
+ DEMANGLE_ABI char *finishDemangle(void *OB) const;
/// Get the base name of a function. This doesn't include trailing template
/// arguments, i.e. for "a::b<int>" this function returns "b".
- char *getFunctionBaseName(char *Buf, size_t *N) const;
+ DEMANGLE_ABI char *getFunctionBaseName(char *Buf, size_t *N) const;
/// Get the context name for a function. For "a::b::c", this function returns
/// "a::b".
- char *getFunctionDeclContextName(char *Buf, size_t *N) const;
+ DEMANGLE_ABI char *getFunctionDeclContextName(char *Buf, size_t *N) const;
/// Get the entire name of this function.
- char *getFunctionName(char *Buf, size_t *N) const;
+ DEMANGLE_ABI char *getFunctionName(char *Buf, size_t *N) const;
/// Get the parameters for this function.
- char *getFunctionParameters(char *Buf, size_t *N) const;
- char *getFunctionReturnType(char *Buf, size_t *N) const;
+ DEMANGLE_ABI char *getFunctionParameters(char *Buf, size_t *N) const;
+ DEMANGLE_ABI char *getFunctionReturnType(char *Buf, size_t *N) const;
/// If this function has any cv or reference qualifiers. These imply that
/// the function is a non-static member function.
- bool hasFunctionQualifiers() const;
+ DEMANGLE_ABI bool hasFunctionQualifiers() const;
/// If this symbol describes a constructor or destructor.
- bool isCtorOrDtor() const;
+ DEMANGLE_ABI bool isCtorOrDtor() const;
/// If this symbol describes a function.
- bool isFunction() const;
+ DEMANGLE_ABI bool isFunction() const;
/// If this symbol describes a variable.
- bool isData() const;
+ DEMANGLE_ABI bool isData() const;
/// If this symbol is a <special-name>. These are generally implicitly
/// generated by the implementation, such as vtables and typeinfo names.
- bool isSpecialName() const;
+ DEMANGLE_ABI bool isSpecialName() const;
- ~ItaniumPartialDemangler();
+ DEMANGLE_ABI ~ItaniumPartialDemangler();
private:
void *RootNode;
diff --git a/llvm/include/llvm/Demangle/DemangleConfig.h b/llvm/include/llvm/Demangle/DemangleConfig.h
index 30f72ff..8807a0e 100644
--- a/llvm/include/llvm/Demangle/DemangleConfig.h
+++ b/llvm/include/llvm/Demangle/DemangleConfig.h
@@ -94,4 +94,24 @@
#define DEMANGLE_NAMESPACE_BEGIN namespace llvm { namespace itanium_demangle {
#define DEMANGLE_NAMESPACE_END } }
+/// DEMANGLE_ABI is the export/visibility macro used to mark symbols declared in
+/// llvm/Demangle as exported when built as a shared library.
+#if defined(LLVM_BUILD_STATIC) || !defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS)
+#define DEMANGLE_ABI
+#else
+#if defined(_WIN32)
+#if defined(LLVM_EXPORTS)
+#define DEMANGLE_ABI __declspec(dllexport)
+#else
+#define DEMANGLE_ABI __declspec(dllimport)
+#endif
+#else
+#if __has_attribute(visibility)
+#define DEMANGLE_ABI __attribute__((__visibility__("default")))
+#else
+#define DEMANGLE_ABI
+#endif
+#endif
+#endif
+
#endif
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 5533652..62d427c 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -3049,7 +3049,8 @@ template <typename Derived, typename Alloc> struct AbstractManglingParser {
Node *parse(bool ParseParams = true);
};
-const char* parse_discriminator(const char* first, const char* last);
+DEMANGLE_ABI const char *parse_discriminator(const char *first,
+ const char *last);
// <name> ::= <nested-name> // N
// ::= <local-name> # See Scope Encoding below // Z
diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangle.h b/llvm/include/llvm/Demangle/MicrosoftDemangle.h
index b9a25e3..a2af875 100644
--- a/llvm/include/llvm/Demangle/MicrosoftDemangle.h
+++ b/llvm/include/llvm/Demangle/MicrosoftDemangle.h
@@ -10,6 +10,7 @@
#define LLVM_DEMANGLE_MICROSOFTDEMANGLE_H
#include "llvm/Demangle/Demangle.h"
+#include "llvm/Demangle/DemangleConfig.h"
#include "llvm/Demangle/MicrosoftDemangleNodes.h"
#include <cassert>
@@ -151,14 +152,14 @@ public:
// You are supposed to call parse() first and then check if error is true. If
// it is false, call output() to write the formatted name to the given stream.
- SymbolNode *parse(std::string_view &MangledName);
+ DEMANGLE_ABI SymbolNode *parse(std::string_view &MangledName);
- TagTypeNode *parseTagUniqueName(std::string_view &MangledName);
+ DEMANGLE_ABI TagTypeNode *parseTagUniqueName(std::string_view &MangledName);
// True if an error occurred.
bool Error = false;
- void dumpBackReferences();
+ DEMANGLE_ABI void dumpBackReferences();
private:
SymbolNode *demangleEncodedSymbol(std::string_view &MangledName,
diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
index a9cfe72..155cfe8 100644
--- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
+++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -13,6 +13,7 @@
#ifndef LLVM_DEMANGLE_MICROSOFTDEMANGLENODES_H
#define LLVM_DEMANGLE_MICROSOFTDEMANGLENODES_H
+#include "DemangleConfig.h"
#include <array>
#include <cstdint>
#include <string>
@@ -281,7 +282,7 @@ struct Node {
virtual void output(OutputBuffer &OB, OutputFlags Flags) const = 0;
- std::string toString(OutputFlags Flags = OF_Default) const;
+ DEMANGLE_ABI std::string toString(OutputFlags Flags = OF_Default) const;
private:
NodeKind Kind;
@@ -332,7 +333,7 @@ struct TypeNode : public Node {
Qualifiers Quals = Q_None;
};
-struct PrimitiveTypeNode : public TypeNode {
+struct DEMANGLE_ABI PrimitiveTypeNode : public TypeNode {
explicit PrimitiveTypeNode(PrimitiveKind K)
: TypeNode(NodeKind::PrimitiveType), PrimKind(K) {}
@@ -346,7 +347,7 @@ struct PrimitiveTypeNode : public TypeNode {
PrimitiveKind PrimKind;
};
-struct FunctionSignatureNode : public TypeNode {
+struct DEMANGLE_ABI FunctionSignatureNode : public TypeNode {
explicit FunctionSignatureNode(NodeKind K) : TypeNode(K) {}
FunctionSignatureNode() : TypeNode(NodeKind::FunctionSignature) {}
@@ -394,10 +395,11 @@ struct IdentifierNode : public Node {
NodeArrayNode *TemplateParams = nullptr;
protected:
- void outputTemplateParameters(OutputBuffer &OB, OutputFlags Flags) const;
+ DEMANGLE_ABI void outputTemplateParameters(OutputBuffer &OB,
+ OutputFlags Flags) const;
};
-struct VcallThunkIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI VcallThunkIdentifierNode : public IdentifierNode {
VcallThunkIdentifierNode() : IdentifierNode(NodeKind::VcallThunkIdentifier) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -409,7 +411,7 @@ struct VcallThunkIdentifierNode : public IdentifierNode {
uint64_t OffsetInVTable = 0;
};
-struct DynamicStructorIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI DynamicStructorIdentifierNode : public IdentifierNode {
DynamicStructorIdentifierNode()
: IdentifierNode(NodeKind::DynamicStructorIdentifier) {}
@@ -424,7 +426,7 @@ struct DynamicStructorIdentifierNode : public IdentifierNode {
bool IsDestructor = false;
};
-struct NamedIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI NamedIdentifierNode : public IdentifierNode {
NamedIdentifierNode() : IdentifierNode(NodeKind::NamedIdentifier) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -436,7 +438,7 @@ struct NamedIdentifierNode : public IdentifierNode {
std::string_view Name;
};
-struct IntrinsicFunctionIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI IntrinsicFunctionIdentifierNode : public IdentifierNode {
explicit IntrinsicFunctionIdentifierNode(IntrinsicFunctionKind Operator)
: IdentifierNode(NodeKind::IntrinsicFunctionIdentifier),
Operator(Operator) {}
@@ -450,7 +452,7 @@ struct IntrinsicFunctionIdentifierNode : public IdentifierNode {
IntrinsicFunctionKind Operator;
};
-struct LiteralOperatorIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI LiteralOperatorIdentifierNode : public IdentifierNode {
LiteralOperatorIdentifierNode()
: IdentifierNode(NodeKind::LiteralOperatorIdentifier) {}
@@ -463,7 +465,7 @@ struct LiteralOperatorIdentifierNode : public IdentifierNode {
std::string_view Name;
};
-struct LocalStaticGuardIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI LocalStaticGuardIdentifierNode : public IdentifierNode {
LocalStaticGuardIdentifierNode()
: IdentifierNode(NodeKind::LocalStaticGuardIdentifier) {}
@@ -477,7 +479,7 @@ struct LocalStaticGuardIdentifierNode : public IdentifierNode {
uint32_t ScopeIndex = 0;
};
-struct ConversionOperatorIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI ConversionOperatorIdentifierNode : public IdentifierNode {
ConversionOperatorIdentifierNode()
: IdentifierNode(NodeKind::ConversionOperatorIdentifier) {}
@@ -491,7 +493,7 @@ struct ConversionOperatorIdentifierNode : public IdentifierNode {
TypeNode *TargetType = nullptr;
};
-struct StructorIdentifierNode : public IdentifierNode {
+struct DEMANGLE_ABI StructorIdentifierNode : public IdentifierNode {
StructorIdentifierNode() : IdentifierNode(NodeKind::StructorIdentifier) {}
explicit StructorIdentifierNode(bool IsDestructor)
: IdentifierNode(NodeKind::StructorIdentifier),
@@ -508,7 +510,7 @@ struct StructorIdentifierNode : public IdentifierNode {
bool IsDestructor = false;
};
-struct ThunkSignatureNode : public FunctionSignatureNode {
+struct DEMANGLE_ABI ThunkSignatureNode : public FunctionSignatureNode {
ThunkSignatureNode() : FunctionSignatureNode(NodeKind::ThunkSignature) {}
void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -528,7 +530,7 @@ struct ThunkSignatureNode : public FunctionSignatureNode {
ThisAdjustor ThisAdjust;
};
-struct PointerTypeNode : public TypeNode {
+struct DEMANGLE_ABI PointerTypeNode : public TypeNode {
PointerTypeNode() : TypeNode(NodeKind::PointerType) {}
void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -550,7 +552,7 @@ struct PointerTypeNode : public TypeNode {
TypeNode *Pointee = nullptr;
};
-struct TagTypeNode : public TypeNode {
+struct DEMANGLE_ABI TagTypeNode : public TypeNode {
explicit TagTypeNode(TagKind Tag) : TypeNode(NodeKind::TagType), Tag(Tag) {}
void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -562,7 +564,7 @@ struct TagTypeNode : public TypeNode {
TagKind Tag;
};
-struct ArrayTypeNode : public TypeNode {
+struct DEMANGLE_ABI ArrayTypeNode : public TypeNode {
ArrayTypeNode() : TypeNode(NodeKind::ArrayType) {}
void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -591,7 +593,7 @@ struct IntrinsicNode : public TypeNode {
}
};
-struct CustomTypeNode : public TypeNode {
+struct DEMANGLE_ABI CustomTypeNode : public TypeNode {
CustomTypeNode() : TypeNode(NodeKind::Custom) {}
void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -602,7 +604,7 @@ struct CustomTypeNode : public TypeNode {
IdentifierNode *Identifier = nullptr;
};
-struct NodeArrayNode : public Node {
+struct DEMANGLE_ABI NodeArrayNode : public Node {
NodeArrayNode() : Node(NodeKind::NodeArray) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -618,7 +620,7 @@ struct NodeArrayNode : public Node {
size_t Count = 0;
};
-struct QualifiedNameNode : public Node {
+struct DEMANGLE_ABI QualifiedNameNode : public Node {
QualifiedNameNode() : Node(NodeKind::QualifiedName) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -635,7 +637,7 @@ struct QualifiedNameNode : public Node {
}
};
-struct TemplateParameterReferenceNode : public Node {
+struct DEMANGLE_ABI TemplateParameterReferenceNode : public Node {
TemplateParameterReferenceNode()
: Node(NodeKind::TemplateParameterReference) {}
@@ -653,7 +655,7 @@ struct TemplateParameterReferenceNode : public Node {
bool IsMemberPointer = false;
};
-struct IntegerLiteralNode : public Node {
+struct DEMANGLE_ABI IntegerLiteralNode : public Node {
IntegerLiteralNode() : Node(NodeKind::IntegerLiteral) {}
IntegerLiteralNode(uint64_t Value, bool IsNegative)
: Node(NodeKind::IntegerLiteral), Value(Value), IsNegative(IsNegative) {}
@@ -668,7 +670,7 @@ struct IntegerLiteralNode : public Node {
bool IsNegative = false;
};
-struct RttiBaseClassDescriptorNode : public IdentifierNode {
+struct DEMANGLE_ABI RttiBaseClassDescriptorNode : public IdentifierNode {
RttiBaseClassDescriptorNode()
: IdentifierNode(NodeKind::RttiBaseClassDescriptor) {}
@@ -684,7 +686,7 @@ struct RttiBaseClassDescriptorNode : public IdentifierNode {
uint32_t Flags = 0;
};
-struct SymbolNode : public Node {
+struct DEMANGLE_ABI SymbolNode : public Node {
explicit SymbolNode(NodeKind K) : Node(K) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -696,7 +698,7 @@ struct SymbolNode : public Node {
QualifiedNameNode *Name = nullptr;
};
-struct SpecialTableSymbolNode : public SymbolNode {
+struct DEMANGLE_ABI SpecialTableSymbolNode : public SymbolNode {
explicit SpecialTableSymbolNode()
: SymbolNode(NodeKind::SpecialTableSymbol) {}
@@ -710,7 +712,7 @@ struct SpecialTableSymbolNode : public SymbolNode {
Qualifiers Quals = Qualifiers::Q_None;
};
-struct LocalStaticGuardVariableNode : public SymbolNode {
+struct DEMANGLE_ABI LocalStaticGuardVariableNode : public SymbolNode {
LocalStaticGuardVariableNode()
: SymbolNode(NodeKind::LocalStaticGuardVariable) {}
@@ -723,7 +725,7 @@ struct LocalStaticGuardVariableNode : public SymbolNode {
bool IsVisible = false;
};
-struct EncodedStringLiteralNode : public SymbolNode {
+struct DEMANGLE_ABI EncodedStringLiteralNode : public SymbolNode {
EncodedStringLiteralNode() : SymbolNode(NodeKind::EncodedStringLiteral) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -737,7 +739,7 @@ struct EncodedStringLiteralNode : public SymbolNode {
CharKind Char = CharKind::Char;
};
-struct VariableSymbolNode : public SymbolNode {
+struct DEMANGLE_ABI VariableSymbolNode : public SymbolNode {
VariableSymbolNode() : SymbolNode(NodeKind::VariableSymbol) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -750,7 +752,7 @@ struct VariableSymbolNode : public SymbolNode {
TypeNode *Type = nullptr;
};
-struct FunctionSymbolNode : public SymbolNode {
+struct DEMANGLE_ABI FunctionSymbolNode : public SymbolNode {
FunctionSymbolNode() : SymbolNode(NodeKind::FunctionSymbol) {}
void output(OutputBuffer &OB, OutputFlags Flags) const override;
@@ -762,7 +764,7 @@ struct FunctionSymbolNode : public SymbolNode {
FunctionSignatureNode *Signature = nullptr;
};
-struct PointerAuthQualifierNode : public Node {
+struct DEMANGLE_ABI PointerAuthQualifierNode : public Node {
PointerAuthQualifierNode() : Node(NodeKind::PointerAuthQualifier) {}
// __ptrauth takes three arguments:
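The batch of DEMANGLE_ABI annotations above makes the demangler's public entry points explicitly exported rather than relying on default visibility. A rough sketch of what such a macro typically expands to; the guard name is an assumption for illustration, the real definition lives in DemangleConfig.h:

```cpp
// Hypothetical expansion of an export macro like DEMANGLE_ABI; the guard
// DEMANGLE_BUILDING_DLL is an assumed name, not from this patch.
#if defined(_WIN32) && defined(DEMANGLE_BUILDING_DLL)
#define DEMANGLE_ABI __declspec(dllexport) // building the shared library
#elif defined(_WIN32)
#define DEMANGLE_ABI __declspec(dllimport) // consuming it
#else
#define DEMANGLE_ABI __attribute__((visibility("default")))
#endif
```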
diff --git a/llvm/include/llvm/ExecutionEngine/MCJIT.h b/llvm/include/llvm/ExecutionEngine/MCJIT.h
index c836c06..1e035c0 100644
--- a/llvm/include/llvm/ExecutionEngine/MCJIT.h
+++ b/llvm/include/llvm/ExecutionEngine/MCJIT.h
@@ -15,8 +15,8 @@
#define LLVM_EXECUTIONENGINE_MCJIT_H
#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/Support/AlwaysTrue.h"
#include "llvm/Support/Compiler.h"
-#include <cstdlib>
extern "C" LLVM_ABI void LLVMLinkInMCJIT();
@@ -24,13 +24,11 @@ namespace {
struct ForceMCJITLinking {
ForceMCJITLinking() {
// We must reference MCJIT in such a way that compilers will not
- // delete it all as dead code, even with whole program optimization,
- // yet is effectively a NO-OP. As the compiler isn't smart enough
- // to know that getenv() never returns -1, this will do the job.
- // This is so that globals in the translation units where these functions
- // are defined are forced to be initialized, populating various
- // registries.
- if (std::getenv("bar") != (char*) -1)
+ // delete it all as dead code, even with whole program optimization, yet
+ // the reference is effectively a NO-OP. This is so that globals in the
+ // translation units where these functions are defined are forced to be
+ // initialized, populating various registries.
+ if (llvm::getNonFoldableAlwaysTrue())
return;
LLVMLinkInMCJIT();
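The getenv trick gives way to a dedicated helper. A minimal sketch of one plausible implementation, assuming the real llvm/Support/AlwaysTrue.h may differ in detail:

```cpp
namespace llvm {
// A volatile read cannot be constant-folded, so the early return guarded
// by it survives whole-program optimization yet always takes the same path.
inline bool getNonFoldableAlwaysTrue() {
  static volatile bool AlwaysTrue = true;
  return AlwaysTrue;
}
} // namespace llvm
```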
diff --git a/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h b/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
index 0aa122f..6fa51ed 100644
--- a/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
+++ b/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
@@ -15,6 +15,8 @@
#define LLVM_FRONTEND_HLSL_ROOTSIGNATUREMETADATA_H
#include "llvm/Frontend/HLSL/HLSLRootSignature.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/DXContainerRootSignature.h"
namespace llvm {
class LLVMContext;
@@ -49,6 +51,48 @@ private:
SmallVector<Metadata *> GeneratedMetadata;
};
+enum class RootSignatureElementKind {
+ Error = 0,
+ RootFlags = 1,
+ RootConstants = 2,
+ SRV = 3,
+ UAV = 4,
+ CBV = 5,
+ DescriptorTable = 6,
+ StaticSamplers = 7
+};
+
+class MetadataParser {
+public:
+ MetadataParser(MDNode *Root) : Root(Root) {}
+
+ LLVM_ABI bool ParseRootSignature(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD);
+
+private:
+ bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootFlagNode);
+ bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootConstantNode);
+ bool parseRootDescriptors(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootDescriptorNode,
+ RootSignatureElementKind ElementKind);
+ bool parseDescriptorRange(LLVMContext *Ctx, mcdxbc::DescriptorTable &Table,
+ MDNode *RangeDescriptorNode);
+ bool parseDescriptorTable(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *DescriptorTableNode);
+ bool parseRootSignatureElement(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *Element);
+ bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *StaticSamplerNode);
+
+ bool validateRootSignature(LLVMContext *Ctx,
+ const llvm::mcdxbc::RootSignatureDesc &RSD);
+
+ MDNode *Root;
+};
+
} // namespace rootsig
} // namespace hlsl
} // namespace llvm
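A hedged sketch of how a client might drive the new parser; the wrapper function and the assumption that the parse entry point returns true on error are illustrative, not part of this patch:

```cpp
#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
using namespace llvm;

// Parse a root signature metadata node into an mcdxbc descriptor.
bool lowerRootSignature(LLVMContext &Ctx, MDNode *RootSigNode) {
  mcdxbc::RootSignatureDesc RSD;
  hlsl::rootsig::MetadataParser Parser(RootSigNode);
  return Parser.ParseRootSignature(&Ctx, RSD);
}
```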
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index de888ff..7919f7a 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -779,16 +779,17 @@ struct LinkT {
template <typename T, typename I, typename E> //
struct MapT {
using LocatorList = ObjectListT<I, E>;
- ENUM(MapType, To, From, Tofrom, Alloc, Release, Delete);
- ENUM(MapTypeModifier, Always, Close, Present, OmpxHold);
+ ENUM(MapType, To, From, Tofrom, Storage);
+ ENUM(MapTypeModifier, Always, Close, Delete, Present, Self, OmpxHold);
+ ENUM(RefModifier, RefPtee, RefPtr, RefPtrPtee);
// See note at the definition of the MapperT type.
using Mappers = ListT<type::MapperT<I, E>>; // Not a spec name
using Iterator = type::IteratorT<T, I, E>;
using MapTypeModifiers = ListT<MapTypeModifier>; // Not a spec name
using TupleTrait = std::true_type;
- std::tuple<OPT(MapType), OPT(MapTypeModifiers), OPT(Mappers), OPT(Iterator),
- LocatorList>
+ std::tuple<OPT(MapType), OPT(MapTypeModifiers), OPT(RefModifier),
+ OPT(Mappers), OPT(Iterator), LocatorList>
t;
};
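Since MapT::t is a plain std::tuple, the new optional slot shifts everything after the modifiers down by one index. An illustrative accessor, assuming the usual std::get-based access:

```cpp
#include <tuple>

// OPT(RefModifier) now sits in the third slot, after MapType and the
// MapTypeModifiers; Mappers, Iterator, and the locator list follow it.
template <typename T, typename I, typename E>
auto getRefModifier(const tomp::clause::MapT<T, I, E> &Map) {
  return std::get<2>(Map.t);
}
```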
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index 611bfe3..047baa3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -708,6 +708,7 @@ bool ConstructDecompositionT<C, H>::applyClause(
tomp::clause::MapT<TypeTy, IdTy, ExprTy>{
{/*MapType=*/MapType::Tofrom,
/*MapTypeModifier=*/std::nullopt,
+ /*RefModifier=*/std::nullopt,
/*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
/*LocatorList=*/std::move(tofrom)}});
dirTarget->clauses.push_back(map);
@@ -969,8 +970,8 @@ bool ConstructDecompositionT<C, H>::applyClause(
llvm::omp::Clause::OMPC_map,
tomp::clause::MapT<TypeTy, IdTy, ExprTy>{
{/*MapType=*/MapType::Tofrom, /*MapTypeModifier=*/std::nullopt,
- /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
- /*LocatorList=*/std::move(tofrom)}});
+ /*RefModifier=*/std::nullopt, /*Mapper=*/std::nullopt,
+ /*Iterator=*/std::nullopt, /*LocatorList=*/std::move(tofrom)}});
dirTarget->clauses.push_back(map);
applied = true;
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index d68491e..ef761eb 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -284,6 +284,9 @@ namespace CallingConv {
RISCV_VLSCall_32768 = 122,
RISCV_VLSCall_65536 = 123,
+ // Calling convention for AMDGPU whole wave functions.
+ AMDGPU_Gfx_WholeWave = 124,
+
/// The highest possible ID. Must be some 2^k - 1.
MaxID = 1023
};
@@ -294,8 +297,13 @@ namespace CallingConv {
/// directly or indirectly via a call-like instruction.
constexpr bool isCallableCC(CallingConv::ID CC) {
switch (CC) {
+ // Called with special intrinsics:
+ // llvm.amdgcn.cs.chain
case CallingConv::AMDGPU_CS_Chain:
case CallingConv::AMDGPU_CS_ChainPreserve:
+ // llvm.amdgcn.call.whole.wave
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ // Hardware entry points:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
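Because isCallableCC is constexpr, the new convention's callability can be verified at compile time; a one-line sanity sketch:

```cpp
#include "llvm/IR/CallingConv.h"

static_assert(llvm::CallingConv::isCallableCC(
                  llvm::CallingConv::AMDGPU_Gfx_WholeWave),
              "whole wave functions are callable via the intrinsic");
```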
diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index f8241a3..c529a86 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -39,30 +39,26 @@ class DbgVariableRecord;
class Instruction;
class Module;
-/// Finds dbg.declare intrinsics declaring local variables as living in the
+/// Finds dbg.declare records declaring local variables as living in the
/// memory that 'V' points to.
-LLVM_ABI TinyPtrVector<DbgDeclareInst *> findDbgDeclares(Value *V);
-/// As above, for DVRDeclares.
LLVM_ABI TinyPtrVector<DbgVariableRecord *> findDVRDeclares(Value *V);
/// As above, for DVRValues.
LLVM_ABI TinyPtrVector<DbgVariableRecord *> findDVRValues(Value *V);
-/// Finds the llvm.dbg.value intrinsics describing a value.
-LLVM_ABI void findDbgValues(
- SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords = nullptr);
-
-/// Finds the debug info intrinsics describing a value.
-LLVM_ABI void findDbgUsers(
- SmallVectorImpl<DbgVariableIntrinsic *> &DbgInsts, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords = nullptr);
+/// Finds the debug info records describing a value.
+LLVM_ABI void
+findDbgUsers(Value *V,
+ SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords);
+/// Finds the dbg.values describing a value.
+LLVM_ABI void
+findDbgValues(Value *V,
+ SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords);
/// Find subprogram that is enclosing this scope.
LLVM_ABI DISubprogram *getDISubprogram(const MDNode *Scope);
/// Produce a DebugLoc to use for each dbg.declare that is promoted to a
/// dbg.value.
-LLVM_ABI DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII);
LLVM_ABI DebugLoc getDebugValueLoc(DbgVariableRecord *DVR);
/// Strip debug info in the module if it exists.
@@ -192,13 +188,6 @@ using AssignmentInstRange =
/// Iterators invalidated by adding or removing DIAssignID metadata to/from any
/// instruction (including by deleting or cloning instructions).
LLVM_ABI AssignmentInstRange getAssignmentInsts(DIAssignID *ID);
-/// Return a range of instructions (typically just one) that perform the
-/// assignment that \p DAI encodes.
-/// Iterators invalidated by adding or removing DIAssignID metadata to/from any
-/// instruction (including by deleting or cloning instructions).
-inline AssignmentInstRange getAssignmentInsts(const DbgAssignIntrinsic *DAI) {
- return getAssignmentInsts(DAI->getAssignID());
-}
inline AssignmentInstRange getAssignmentInsts(const DbgVariableRecord *DVR) {
assert(DVR->isDbgAssign() &&
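Callers of the removed intrinsic-based overloads now receive everything as DbgVariableRecords. A minimal migration sketch; the function name is illustrative:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfo.h"
using namespace llvm;

void collectDebugUses(Value *V) {
  SmallVector<DbgVariableRecord *> Values, Users;
  findDbgValues(V, Values); // dbg.value records describing V
  findDbgUsers(V, Users);   // all debug records that use V
}
```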
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ecda6c4..3a7db6d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+def flat_ptr_ty : LLVMQualPointerType<0>;
def global_ptr_ty : LLVMQualPointerType<1>;
def local_ptr_ty : LLVMQualPointerType<3>;
@@ -3045,6 +3046,24 @@ def int_amdgcn_ds_bpermute_fi_b32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">,
+ Intrinsic<[],
+ [llvm_ptr_ty, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
+def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">,
+ Intrinsic<[],
+ [LLVMQualPointerType<1>, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
@@ -3717,6 +3736,20 @@ class AMDGPUWmmaIntrinsicModsAllDiff<LLVMType DstTy, LLVMType AB, LLVMType C> :
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
+class AMDGPUWmmaIntrinsicModsC_MatrixFMT :
+ Intrinsic<
+ [llvm_anyfloat_ty], // %D
+ [
+ llvm_i32_ty, // matrix_a_fmt
+ llvm_anyint_ty, // %A
+ llvm_i32_ty, // matrix_b_fmt
+ llvm_anyint_ty, // %B
+ llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
+ LLVMMatchType<0>, // %C
+ ],
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+>;
+
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@@ -3741,6 +3774,7 @@ def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint
def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
}
@@ -3813,6 +3847,26 @@ def int_amdgcn_tensor_load_to_lds_d2 :
def int_amdgcn_tensor_store_from_lds_d2 :
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
+class AMDGPULoadMonitor<LLVMType ptr_ty>:
+ Intrinsic<
+ [llvm_any_ty],
+ [ptr_ty,
+ llvm_i32_ty], // gfx12+ cachepolicy:
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+ "",
+ [SDNPMemOperand]
+ >;
+
+def int_amdgcn_flat_load_monitor_b32 : AMDGPULoadMonitor<flat_ptr_ty>;
+def int_amdgcn_flat_load_monitor_b64 : AMDGPULoadMonitor<flat_ptr_ty>;
+def int_amdgcn_flat_load_monitor_b128 : AMDGPULoadMonitor<flat_ptr_ty>;
+def int_amdgcn_global_load_monitor_b32 : AMDGPULoadMonitor<global_ptr_ty>;
+def int_amdgcn_global_load_monitor_b64 : AMDGPULoadMonitor<global_ptr_ty>;
+def int_amdgcn_global_load_monitor_b128 : AMDGPULoadMonitor<global_ptr_ty>;
+
/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
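For the new prefetch intrinsics, middle-end code would materialize the call roughly as below; this is a sketch in C++ rather than TableGen, and the zero cache policy is just one valid immediate under the th/scope packing documented above:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

void emitGlobalPrefetch(IRBuilder<> &B, Value *GlobalPtr) {
  Module *M = B.GetInsertBlock()->getModule();
  Function *Fn =
      Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_global_prefetch);
  // Second operand is an ImmArg: bits [0-2] = th, bits [3-4] = scope.
  B.CreateCall(Fn, {GlobalPtr, B.getInt32(0)});
}
```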
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5ddc144..967d166 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -331,6 +331,11 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
!eq(gf,"m8n16:x2") : !listsplat(llvm_i32_ty, 2),
!eq(gf,"m8n16:x4") : !listsplat(llvm_i32_ty, 4),
+ // stmatrix b8 -> s32 @ m16n8
+ !eq(gf,"m16n8:x1") : !listsplat(llvm_i32_ty, 1),
+ !eq(gf,"m16n8:x2") : !listsplat(llvm_i32_ty, 2),
+ !eq(gf,"m16n8:x4") : !listsplat(llvm_i32_ty, 4),
+
);
}
@@ -403,6 +408,17 @@ class LDMATRIX_NAME<WMMA_REGS Frag, int Trans> {
!subst("llvm.", "int_", intr));
}
+class STMATRIX_NAME<WMMA_REGS Frag, int Trans> {
+ string intr = "llvm.nvvm.stmatrix.sync.aligned"
+ # "." # Frag.geom
+ # "." # Frag.frag
+ # !if(Trans, ".trans", "")
+ # "." # Frag.ptx_elt_type
+ ;
+ string record = !subst(".", "_",
+ !subst("llvm.", "int_", intr));
+}
+
// Generates list of 4-tuples of WMMA_REGS representing a valid MMA op.
// Geom: list of supported geometries.
// TypeN: PTX type of the corresponding fragment's element.
@@ -443,6 +459,16 @@ class LDMATRIX_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
list<string> ops = !foreach(x, ret, x.gft);
}
+class STMATRIX_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
+ list<WMMA_REGS> ret =
+ !foldl([]<WMMA_REGS>, Geom, t1, geom, !listconcat(t1,
+ !foldl([]<WMMA_REGS>, Frags, t2, frag, !listconcat(t2,
+ !foldl([]<WMMA_REGS>, Types, t3, type, !listconcat(t3,
+ [WMMA_REGS<geom, frag, type>]))))));
+ // Debugging aid for readable representation of the list above.
+ list<string> ops = !foreach(x, ret, x.gft);
+}
+
// Creates list of valid combinations of fragments. This is the main list that
// drives generation of corresponding intrinsics and instructions.
class NVVM_MMA_OPS {
@@ -537,9 +563,18 @@ class NVVM_MMA_OPS {
list<WMMA_REGS> ldmatrix_geom_m8n16_ops = LDMATRIX_OPS<
["m8n16"], ["x1", "x2", "x4"], ["b8x16.b6x16_p32", "b8x16.b4x16_p64"]>.ret;
+ list<WMMA_REGS> stmatrix_b16_ops = STMATRIX_OPS<
+ ["m8n8"], ["x1", "x2", "x4"], ["b16"]>.ret;
+
+ list<WMMA_REGS> stmatrix_b8_ops = STMATRIX_OPS<
+ ["m16n8"], ["x1", "x2", "x4"], ["b8"]>.ret;
+
list<WMMA_REGS> all_ldmatrix_ops = !listconcat(ldmatrix_b16_ops,
ldmatrix_geom_m16n16_ops,
ldmatrix_geom_m8n16_ops);
+
+ list<WMMA_REGS> all_stmatrix_ops = !listconcat(stmatrix_b16_ops,
+ stmatrix_b8_ops);
}
def NVVM_MMA_OPS : NVVM_MMA_OPS;
@@ -680,6 +715,19 @@ class NVVM_LDMATRIX_SUPPORTED<WMMA_REGS frag, bit trans> {
);
}
+// Returns true if the fragment is supported by stmatrix ops;
+// false otherwise.
+class NVVM_STMATRIX_SUPPORTED<WMMA_REGS frag, bit trans> {
+ string g = frag.geom;
+ string t = frag.ptx_elt_type;
+
+ bit ret = !cond(
+ !and(!eq(g, "m8n8"), !eq(t, "b16")): true,
+ !and(!eq(g, "m16n8"), !eq(t, "b8"), !eq(trans, 1)): true,
+ true: false
+ );
+}
+
class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
string Suffix = !if(sync, "sync_", "")
# mode # "_"
@@ -1969,6 +2017,23 @@ foreach transposed = [0, 1] in {
}
}
+// STMATRIX
+class NVVM_STMATRIX<WMMA_REGS Frag, int Transposed>
+ : Intrinsic<[],
+ !listconcat([llvm_anyptr_ty], Frag.regs),
+ [IntrWriteMem, IntrArgMemOnly, IntrNoCallback,
+ WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>],
+ STMATRIX_NAME<Frag, Transposed>.intr>;
+
+foreach transposed = [0, 1] in {
+ foreach frag = NVVM_MMA_OPS.all_stmatrix_ops in {
+ if NVVM_STMATRIX_SUPPORTED<frag, transposed>.ret then {
+ def STMATRIX_NAME<frag, transposed>.record
+ : NVVM_STMATRIX<frag, transposed>;
+ }
+ }
+}
+
// MAPA
let IntrProperties = [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] in {
def int_nvvm_mapa
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index f592ff2..c1e4b97 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -43,6 +43,10 @@ def int_wasm_ref_is_null_exn :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_exnref_ty], [IntrNoMem],
"llvm.wasm.ref.is_null.exn">;
+def int_wasm_ref_test_func
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_vararg_ty],
+ [IntrNoMem]>;
+
//===----------------------------------------------------------------------===//
// Table intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index 737610b..0fd5de3 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -112,7 +112,6 @@ inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
return false;
}
llvm_unreachable("Checking FTZ flag for invalid f2i/d2i intrinsic");
- return false;
}
inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) {
@@ -179,7 +178,6 @@ inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) {
}
llvm_unreachable(
"Checking invalid f2i/d2i intrinsic for signed int conversion");
- return false;
}
inline APFloat::roundingMode
@@ -250,7 +248,6 @@ GetFPToIntegerRoundingMode(Intrinsic::ID IntrinsicID) {
return APFloat::rmTowardZero;
}
llvm_unreachable("Checking rounding mode for invalid f2i/d2i intrinsic");
- return APFloat::roundingMode::Invalid;
}
inline bool FMinFMaxShouldFTZ(Intrinsic::ID IntrinsicID) {
@@ -280,7 +277,6 @@ inline bool FMinFMaxShouldFTZ(Intrinsic::ID IntrinsicID) {
return false;
}
llvm_unreachable("Checking FTZ flag for invalid fmin/fmax intrinsic");
- return false;
}
inline bool FMinFMaxPropagatesNaNs(Intrinsic::ID IntrinsicID) {
@@ -310,7 +306,6 @@ inline bool FMinFMaxPropagatesNaNs(Intrinsic::ID IntrinsicID) {
return false;
}
llvm_unreachable("Checking NaN flag for invalid fmin/fmax intrinsic");
- return false;
}
inline bool FMinFMaxIsXorSignAbs(Intrinsic::ID IntrinsicID) {
@@ -340,7 +335,83 @@ inline bool FMinFMaxIsXorSignAbs(Intrinsic::ID IntrinsicID) {
return false;
}
llvm_unreachable("Checking XorSignAbs flag for invalid fmin/fmax intrinsic");
- return false;
+}
+
+inline bool UnaryMathIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
+ switch (IntrinsicID) {
+ case Intrinsic::nvvm_ceil_ftz_f:
+ case Intrinsic::nvvm_fabs_ftz:
+ case Intrinsic::nvvm_floor_ftz_f:
+ case Intrinsic::nvvm_round_ftz_f:
+ case Intrinsic::nvvm_saturate_ftz_f:
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ return true;
+ case Intrinsic::nvvm_ceil_f:
+ case Intrinsic::nvvm_ceil_d:
+ case Intrinsic::nvvm_fabs:
+ case Intrinsic::nvvm_floor_f:
+ case Intrinsic::nvvm_floor_d:
+ case Intrinsic::nvvm_round_f:
+ case Intrinsic::nvvm_round_d:
+ case Intrinsic::nvvm_saturate_d:
+ case Intrinsic::nvvm_saturate_f:
+ case Intrinsic::nvvm_sqrt_f:
+ case Intrinsic::nvvm_sqrt_rn_d:
+ case Intrinsic::nvvm_sqrt_rn_f:
+ return false;
+ }
+ llvm_unreachable("Checking FTZ flag for invalid unary intrinsic");
+}
+
+inline bool RCPShouldFTZ(Intrinsic::ID IntrinsicID) {
+ switch (IntrinsicID) {
+ case Intrinsic::nvvm_rcp_rm_ftz_f:
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ case Intrinsic::nvvm_rcp_rp_ftz_f:
+ case Intrinsic::nvvm_rcp_rz_ftz_f:
+ return true;
+ case Intrinsic::nvvm_rcp_rm_d:
+ case Intrinsic::nvvm_rcp_rm_f:
+ case Intrinsic::nvvm_rcp_rn_d:
+ case Intrinsic::nvvm_rcp_rn_f:
+ case Intrinsic::nvvm_rcp_rp_d:
+ case Intrinsic::nvvm_rcp_rp_f:
+ case Intrinsic::nvvm_rcp_rz_d:
+ case Intrinsic::nvvm_rcp_rz_f:
+ return false;
+ }
+ llvm_unreachable("Checking FTZ flag for invalid rcp intrinsic");
+}
+
+inline APFloat::roundingMode GetRCPRoundingMode(Intrinsic::ID IntrinsicID) {
+ switch (IntrinsicID) {
+ case Intrinsic::nvvm_rcp_rm_f:
+ case Intrinsic::nvvm_rcp_rm_d:
+ case Intrinsic::nvvm_rcp_rm_ftz_f:
+ return APFloat::rmTowardNegative;
+
+ case Intrinsic::nvvm_rcp_rn_f:
+ case Intrinsic::nvvm_rcp_rn_d:
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ return APFloat::rmNearestTiesToEven;
+
+ case Intrinsic::nvvm_rcp_rp_f:
+ case Intrinsic::nvvm_rcp_rp_d:
+ case Intrinsic::nvvm_rcp_rp_ftz_f:
+ return APFloat::rmTowardPositive;
+
+ case Intrinsic::nvvm_rcp_rz_f:
+ case Intrinsic::nvvm_rcp_rz_d:
+ case Intrinsic::nvvm_rcp_rz_ftz_f:
+ return APFloat::rmTowardZero;
+ }
+ llvm_unreachable("Checking rounding mode for invalid rcp intrinsic");
+}
+
+inline DenormalMode GetNVVMDenormMode(bool ShouldFTZ) {
+ if (ShouldFTZ)
+ return DenormalMode::getPreserveSign();
+ return DenormalMode::getIEEE();
}
} // namespace nvvm
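A caller-side sketch of how a constant folder might use the new rcp helpers (not code from this patch): compute 1/x under the intrinsic's rounding mode, then flush a denormal result for the FTZ variants:

```cpp
#include "llvm/ADT/APFloat.h"
#include "llvm/IR/NVVMIntrinsicUtils.h"
using namespace llvm;

APFloat foldRCP(Intrinsic::ID IID, const APFloat &X) {
  APFloat Res = APFloat::getOne(X.getSemantics());
  Res.divide(X, nvvm::GetRCPRoundingMode(IID));
  if (nvvm::RCPShouldFTZ(IID) && Res.isDenormal())
    Res = APFloat::getZero(X.getSemantics(), Res.isNegative());
  return Res;
}
```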
diff --git a/llvm/include/llvm/IR/PassInstrumentation.h b/llvm/include/llvm/IR/PassInstrumentation.h
index 0315715..33eda5a 100644
--- a/llvm/include/llvm/IR/PassInstrumentation.h
+++ b/llvm/include/llvm/IR/PassInstrumentation.h
@@ -164,7 +164,7 @@ public:
/// Add a class name to pass name mapping for use by pass instrumentation.
LLVM_ABI void addClassToPassName(StringRef ClassName, StringRef PassName);
- /// Get the pass name for a given pass class name.
+ /// Get the pass name for a given pass class name. Empty if no match found.
LLVM_ABI StringRef getPassNameForClassName(StringRef ClassName);
private:
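The sharpened contract lets callers branch on the empty string instead of guessing; a short sketch:

```cpp
#include "llvm/IR/PassInstrumentation.h"
using namespace llvm;

StringRef displayName(PassInstrumentationCallbacks &PIC, StringRef Class) {
  StringRef Name = PIC.getPassNameForClassName(Class);
  return Name.empty() ? Class : Name; // empty means no mapping registered
}
```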
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 50e50a9..27c5d5c 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -822,12 +822,52 @@ template <typename Class> struct bind_ty {
}
};
+/// Check whether the value has the given Class and matches the nested
+/// pattern. Capture it into the provided variable if successful.
+template <typename Class, typename MatchTy> struct bind_and_match_ty {
+ Class *&VR;
+ MatchTy Match;
+
+ bind_and_match_ty(Class *&V, const MatchTy &Match) : VR(V), Match(Match) {}
+
+ template <typename ITy> bool match(ITy *V) const {
+ auto *CV = dyn_cast<Class>(V);
+ if (CV && Match.match(V)) {
+ VR = CV;
+ return true;
+ }
+ return false;
+ }
+};
+
/// Match a value, capturing it if we match.
inline bind_ty<Value> m_Value(Value *&V) { return V; }
inline bind_ty<const Value> m_Value(const Value *&V) { return V; }
+/// Match against the nested pattern, and capture the value if we match.
+template <typename MatchTy>
+inline bind_and_match_ty<Value, MatchTy> m_Value(Value *&V,
+ const MatchTy &Match) {
+ return {V, Match};
+}
+
+/// Match against the nested pattern, and capture the value if we match.
+template <typename MatchTy>
+inline bind_and_match_ty<const Value, MatchTy> m_Value(const Value *&V,
+ const MatchTy &Match) {
+ return {V, Match};
+}
+
/// Match an instruction, capturing it if we match.
inline bind_ty<Instruction> m_Instruction(Instruction *&I) { return I; }
+
+/// Match against the nested pattern, and capture the instruction if we match.
+template <typename MatchTy>
+inline bind_and_match_ty<Instruction, MatchTy>
+m_Instruction(Instruction *&I, const MatchTy &Match) {
+ return {I, Match};
+}
+
/// Match a unary operator, capturing it if we match.
inline bind_ty<UnaryOperator> m_UnOp(UnaryOperator *&I) { return I; }
/// Match a binary operator, capturing it if we match.
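Illustrative use of the new capture-and-match overloads: the nested pattern and the capture happen in one step, where previously one had to capture with m_Value and re-match the captured value separately:

```cpp
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Matches "sub 0, (add X, Y)" and captures the add in one pattern.
bool isNegatedAdd(Value *V) {
  Value *Add = nullptr;
  return match(V, m_Neg(m_Value(Add, m_Add(m_Value(), m_Value()))));
}
```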
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 2e231cf..31801da 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -119,7 +119,6 @@ LLVM_ABI void initializeExpandVariadicsPass(PassRegistry &);
LLVM_ABI void initializeExternalAAWrapperPassPass(PassRegistry &);
LLVM_ABI void initializeFEntryInserterLegacyPass(PassRegistry &);
LLVM_ABI void initializeFinalizeISelPass(PassRegistry &);
-LLVM_ABI void initializeFinalizeMachineBundlesPass(PassRegistry &);
LLVM_ABI void initializeFixIrreduciblePass(PassRegistry &);
LLVM_ABI void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &);
LLVM_ABI void initializeFlattenCFGLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllIR.h b/llvm/include/llvm/LinkAllIR.h
index ceed784..894a8dd 100644
--- a/llvm/include/llvm/LinkAllIR.h
+++ b/llvm/include/llvm/LinkAllIR.h
@@ -21,6 +21,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
+#include "llvm/Support/AlwaysTrue.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Memory.h"
@@ -29,19 +30,16 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
-#include <cstdlib>
namespace {
struct ForceVMCoreLinking {
ForceVMCoreLinking() {
// We must reference VMCore in such a way that compilers will not
- // delete it all as dead code, even with whole program optimization,
- // yet is effectively a NO-OP. As the compiler isn't smart enough
- // to know that getenv() never returns -1, this will do the job.
+ // delete it all as dead code, even with whole program optimization.
// This is so that globals in the translation units where these functions
// are defined are forced to be initialized, populating various
// registries.
- if (std::getenv("bar") != (char*) -1)
+ if (llvm::getNonFoldableAlwaysTrue())
return;
llvm::LLVMContext Context;
(void)new llvm::Module("", Context);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index bae7f0d..f82a439 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/Support/AlwaysTrue.h"
#include "llvm/Support/Valgrind.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
@@ -54,14 +55,12 @@ class Triple;
namespace {
struct ForcePassLinking {
ForcePassLinking() {
- // We must reference the passes in such a way that compilers will not
- // delete it all as dead code, even with whole program optimization,
- // yet is effectively a NO-OP. As the compiler isn't smart enough
- // to know that getenv() never returns -1, this will do the job.
- // This is so that globals in the translation units where these functions
- // are defined are forced to be initialized, populating various
- // registries.
- if (std::getenv("bar") != (char *)-1)
+ // We must reference the passes in such a way that compilers will not
+ // delete them all as dead code, even with whole program optimization,
+ // yet the reference is effectively a NO-OP. This is so that globals in
+ // the translation units where these functions are defined are forced to
+ // be initialized, populating various registries.
+ if (llvm::getNonFoldableAlwaysTrue())
return;
(void)llvm::createAtomicExpandLegacyPass();
diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h
index 4b6b42f..14a2429 100644
--- a/llvm/include/llvm/MC/DXContainerRootSignature.h
+++ b/llvm/include/llvm/MC/DXContainerRootSignature.h
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_MC_DXCONTAINERROOTSIGNATURE_H
+#define LLVM_MC_DXCONTAINERROOTSIGNATURE_H
+
#include "llvm/BinaryFormat/DXContainer.h"
#include <cstdint>
#include <limits>
@@ -116,3 +119,5 @@ struct RootSignatureDesc {
};
} // namespace mcdxbc
} // namespace llvm
+
+#endif // LLVM_MC_DXCONTAINERROOTSIGNATURE_H
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 0322cbe..bfc1175 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -18,9 +18,7 @@
namespace llvm {
-class MCAlignFragment;
class MCFragment;
-class MCLEBFragment;
class MCSymbol;
class MCAssembler;
class MCContext;
@@ -60,6 +58,9 @@ protected: // Can only create subclasses.
MCAssembler *Asm = nullptr;
+ bool AllowAutoPadding = false;
+ bool AllowEnhancedRelaxation = false;
+
public:
MCAsmBackend(const MCAsmBackend &) = delete;
MCAsmBackend &operator=(const MCAsmBackend &) = delete;
@@ -73,11 +74,11 @@ public:
/// Return true if this target might automatically pad instructions and thus
/// need to emit padding enable/disable directives around sensitive code.
- virtual bool allowAutoPadding() const { return false; }
+ bool allowAutoPadding() const { return AllowAutoPadding; }
/// Return true if this target allows an unrelaxable instruction to be
/// emitted into RelaxableFragment and then we can increase its size in a
/// tricky way for optimization.
- virtual bool allowEnhancedRelaxation() const { return false; }
+ bool allowEnhancedRelaxation() const { return AllowEnhancedRelaxation; }
/// lifetime management
virtual void reset() {}
@@ -105,21 +106,6 @@ public:
/// Get information on a fixup kind.
virtual MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const;
- /// Hook to check if extra nop bytes must be inserted for alignment directive.
- /// For some targets this may be necessary in order to support linker
- /// relaxation. The number of bytes to insert are returned in Size.
- virtual bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
- unsigned &Size) {
- return false;
- }
-
- /// Hook which indicates if the target requires a fixup to be generated when
- /// handling an align directive in an executable section
- virtual bool shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) {
- return false;
- }
-
// Evaluate a fixup, returning std::nullopt to use default handling for
// `Value` and `IsResolved`. Otherwise, returns `IsResolved` with the
// expectation that the hook updates `Value`.
@@ -177,6 +163,10 @@ public:
}
// Defined by linker relaxation targets.
+
+ // Return false to use default handling. Otherwise, set `Size` to the number
+ // of padding bytes.
+ virtual bool relaxAlign(MCFragment &F, unsigned &Size) { return false; }
virtual bool relaxDwarfLineAddr(MCFragment &, bool &WasRelaxed) const {
return false;
}
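A sketch of how a linker-relaxation target adapts; the class name is illustrative, the constructor argument list is simplified, and the remaining pure virtuals are omitted:

```cpp
#include "llvm/MC/MCAsmBackend.h"

class MyTargetAsmBackend : public llvm::MCAsmBackend {
public:
  MyTargetAsmBackend() : MCAsmBackend(llvm::endianness::little) {
    AllowAutoPadding = true; // replaces the removed virtual override
  }
  // The two shouldInsert* alignment hooks collapse into relaxAlign.
  bool relaxAlign(llvm::MCFragment &F, unsigned &Size) override {
    Size = 4;    // number of padding bytes; target-specific policy
    return true; // false falls back to default handling
  }
};
```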
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index 319e131..aea93e9 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -40,6 +40,7 @@ class MCObjectStreamer : public MCStreamer {
std::unique_ptr<MCAssembler> Assembler;
bool EmitEHFrame;
bool EmitDebugFrame;
+ bool EmitSFrame;
struct PendingAssignment {
MCSymbol *Symbol;
@@ -54,7 +55,6 @@ class MCObjectStreamer : public MCStreamer {
void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &);
void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
- void emitInstructionImpl(const MCInst &Inst, const MCSubtargetInfo &STI);
protected:
MCObjectStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
@@ -71,14 +71,7 @@ public:
void emitFrames(MCAsmBackend *MAB);
MCSymbol *emitCFILabel() override;
- void emitCFISections(bool EH, bool Debug) override;
-
- /// Get a data fragment to write into, creating a new one if the current
- /// fragment is not FT_Data.
- MCFragment *getOrCreateDataFragment();
-
-protected:
- bool changeSectionImpl(MCSection *Section, uint32_t Subsection);
+ void emitCFISections(bool EH, bool Debug, bool SFrame) override;
public:
void visitUsedSymbol(const MCSymbol &Sym) override;
@@ -88,6 +81,15 @@ public:
/// \name MCStreamer Interface
/// @{
+ // Add a fragment with a variable-size tail and start a new empty fragment.
+ void insert(MCFragment *F);
+
+ // Add a new fragment to the current section without a variable-size tail.
+ void newFragment();
+
+ void appendContents(size_t Num, char Elt);
+ void addFixup(const MCExpr *Value, MCFixupKind Kind);
+
void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
virtual void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCFragment &F,
uint64_t Offset);
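With getOrCreateDataFragment gone, emission goes through the streamer's own append/fixup entry points. A sketch with assumed semantics, namely that addFixup records the fixup at the current content offset:

```cpp
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectStreamer.h"
using namespace llvm;

void emitWord32WithFixup(MCObjectStreamer &S, const MCExpr *Value) {
  S.addFixup(Value, FK_Data_4); // fixup resolved against Value later
  S.appendContents(4, 0);       // reserve four placeholder bytes
}
```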
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 66ea8f8..c1f3f02 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -39,150 +39,6 @@ class MCSubtargetInfo;
class raw_ostream;
class Triple;
-/// Instances of this class represent a uniqued identifier for a section in the
-/// current translation unit. The MCContext class uniques and creates these.
-class LLVM_ABI MCSection {
-public:
- friend MCAssembler;
- friend MCObjectStreamer;
- friend class MCFragment;
- static constexpr unsigned NonUniqueID = ~0U;
-
- enum SectionVariant {
- SV_COFF = 0,
- SV_ELF,
- SV_GOFF,
- SV_MachO,
- SV_Wasm,
- SV_XCOFF,
- SV_SPIRV,
- SV_DXContainer,
- };
-
- struct iterator {
- MCFragment *F = nullptr;
- iterator() = default;
- explicit iterator(MCFragment *F) : F(F) {}
- MCFragment &operator*() const { return *F; }
- bool operator==(const iterator &O) const { return F == O.F; }
- bool operator!=(const iterator &O) const { return F != O.F; }
- iterator &operator++();
- };
-
- struct FragList {
- MCFragment *Head = nullptr;
- MCFragment *Tail = nullptr;
- };
-
-private:
- // At parse time, this holds the fragment list of the current subsection. At
- // layout time, this holds the concatenated fragment lists of all subsections.
- FragList *CurFragList;
- MCSymbol *Begin;
- MCSymbol *End = nullptr;
- /// The alignment requirement of this section.
- Align Alignment;
- /// The section index in the assemblers section list.
- unsigned Ordinal = 0;
-
- /// Whether this section has had instructions emitted into it.
- bool HasInstructions : 1;
-
- bool IsRegistered : 1;
-
- bool IsText : 1;
-
- bool IsVirtual : 1;
-
- /// Whether the section contains linker-relaxable fragments. If true, the
- /// offset between two locations may not be fully resolved.
- bool LinkerRelaxable : 1;
-
- // Mapping from subsection number to fragment list. At layout time, the
- // subsection 0 list is replaced with concatenated fragments from all
- // subsections.
- SmallVector<std::pair<unsigned, FragList>, 1> Subsections;
-
- // Content and fixup storage for fragments
- SmallVector<char, 0> ContentStorage;
- SmallVector<MCFixup, 0> FixupStorage;
- SmallVector<MCOperand, 0> MCOperandStorage;
-
-protected:
- // TODO Make Name private when possible.
- StringRef Name;
- SectionVariant Variant;
-
- MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsVirtual,
- MCSymbol *Begin);
- // Protected non-virtual dtor prevents destroy through a base class pointer.
- ~MCSection() {}
-
-public:
- MCSection(const MCSection &) = delete;
- MCSection &operator=(const MCSection &) = delete;
-
- StringRef getName() const { return Name; }
- bool isText() const { return IsText; }
-
- SectionVariant getVariant() const { return Variant; }
-
- MCSymbol *getBeginSymbol() { return Begin; }
- const MCSymbol *getBeginSymbol() const {
- return const_cast<MCSection *>(this)->getBeginSymbol();
- }
- void setBeginSymbol(MCSymbol *Sym) {
- assert(!Begin);
- Begin = Sym;
- }
- MCSymbol *getEndSymbol(MCContext &Ctx);
- bool hasEnded() const;
-
- Align getAlign() const { return Alignment; }
- void setAlignment(Align Value) { Alignment = Value; }
-
- /// Makes sure that Alignment is at least MinAlignment.
- void ensureMinAlignment(Align MinAlignment) {
- if (Alignment < MinAlignment)
- Alignment = MinAlignment;
- }
-
- unsigned getOrdinal() const { return Ordinal; }
- void setOrdinal(unsigned Value) { Ordinal = Value; }
-
- bool hasInstructions() const { return HasInstructions; }
- void setHasInstructions(bool Value) { HasInstructions = Value; }
-
- bool isRegistered() const { return IsRegistered; }
- void setIsRegistered(bool Value) { IsRegistered = Value; }
-
- bool isLinkerRelaxable() const { return LinkerRelaxable; }
- void setLinkerRelaxable() { LinkerRelaxable = true; }
-
- MCFragment &getDummyFragment() { return *Subsections[0].second.Head; }
-
- FragList *curFragList() const { return CurFragList; }
- iterator begin() const { return iterator(CurFragList->Head); }
- iterator end() const { return {}; }
-
- void dump(DenseMap<const MCFragment *, SmallVector<const MCSymbol *, 0>>
- *FragToSyms = nullptr) const;
-
- virtual void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const = 0;
-
- /// Return true if a .align directive should use "optimized nops" to fill
- /// instead of 0s.
- virtual bool useCodeAlign() const = 0;
-
- /// Check whether this section is "virtual", that is has no actual object
- /// file contents.
- bool isVirtualSection() const { return IsVirtual; }
-
- virtual StringRef getVirtualSectionKind() const;
-};
-
// Represents a contiguous piece of code or data within a section. Its size is
// determined by MCAssembler::layout. All subclasses must have trivial
// destructors.
@@ -234,11 +90,16 @@ protected:
/// FT_Relaxable, x86-specific
bool AllowAutoPadding : 1;
+ // Track content and fixups for the fixed-size part as fragments are
+ // appended to the section. The content remains immutable, except when
+ // modified by applyFixup.
uint32_t ContentStart = 0;
uint32_t ContentEnd = 0;
uint32_t FixupStart = 0;
uint32_t FixupEnd = 0;
+ // Track content and fixups for the optional variable-size tail part,
+ // typically modified during relaxation.
uint32_t VarContentStart = 0;
uint32_t VarContentEnd = 0;
uint32_t VarFixupStart = 0;
@@ -255,6 +116,19 @@ protected:
uint32_t OperandSize;
} relax;
struct {
+ // The alignment to ensure, in bytes.
+ Align Alignment;
+ // The size of the integer (in bytes) of \p Value.
+ uint8_t FillLen;
+ // If true, fill with target-specific nop instructions.
+ bool EmitNops;
+ // The maximum number of bytes to emit; if the alignment
+ // cannot be satisfied in this width then this fragment is ignored.
+ unsigned MaxBytesToEmit;
+ // Value to use for filling padding bytes.
+ int64_t Fill;
+ } align;
+ struct {
// True if this is a sleb128, false if uleb128.
bool IsSigned;
// The value this fragment should contain.
@@ -283,6 +157,7 @@ public:
return false;
case MCFragment::FT_Relaxable:
case MCFragment::FT_Data:
+ case MCFragment::FT_Align:
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame:
case MCFragment::FT_LEB:
@@ -327,24 +202,13 @@ public:
bool getAllowAutoPadding() const { return AllowAutoPadding; }
void setAllowAutoPadding(bool V) { AllowAutoPadding = V; }
- // Content-related functions manage parent's storage using ContentStart and
+ //== Content-related functions manage parent's storage using ContentStart and
// ContentSize.
- void clearContents() { ContentEnd = ContentStart; }
+
// Get a SmallVector reference. The caller should call doneAppending to update
// `ContentEnd`.
- SmallVectorImpl<char> &getContentsForAppending() {
- SmallVectorImpl<char> &S = getParent()->ContentStorage;
- if (LLVM_UNLIKELY(ContentEnd != S.size())) {
- // Move the elements to the end. Reserve space to avoid invalidating
- // S.begin()+I for `append`.
- auto Size = ContentEnd - ContentStart;
- auto I = std::exchange(ContentStart, S.size());
- S.reserve(S.size() + Size);
- S.append(S.begin() + I, S.begin() + I + Size);
- }
- return S;
- }
- void doneAppending() { ContentEnd = getParent()->ContentStorage.size(); }
+ SmallVectorImpl<char> &getContentsForAppending();
+ void doneAppending();
void appendContents(ArrayRef<char> Contents) {
getContentsForAppending().append(Contents.begin(), Contents.end());
doneAppending();
@@ -353,26 +217,13 @@ public:
getContentsForAppending().append(Num, Elt);
doneAppending();
}
- LLVM_ABI void setContents(ArrayRef<char> Contents);
- MutableArrayRef<char> getContents() {
- return MutableArrayRef(getParent()->ContentStorage)
- .slice(ContentStart, ContentEnd - ContentStart);
- }
- ArrayRef<char> getContents() const {
- return ArrayRef(getParent()->ContentStorage)
- .slice(ContentStart, ContentEnd - ContentStart);
- }
+ MutableArrayRef<char> getContents();
+ ArrayRef<char> getContents() const;
void setVarContents(ArrayRef<char> Contents);
void clearVarContents() { setVarContents({}); }
- MutableArrayRef<char> getVarContents() {
- return MutableArrayRef(getParent()->ContentStorage)
- .slice(VarContentStart, VarContentEnd - VarContentStart);
- }
- ArrayRef<char> getVarContents() const {
- return ArrayRef(getParent()->ContentStorage)
- .slice(VarContentStart, VarContentEnd - VarContentStart);
- }
+ MutableArrayRef<char> getVarContents();
+ ArrayRef<char> getVarContents() const;
size_t getFixedSize() const { return ContentEnd - ContentStart; }
size_t getVarSize() const { return VarContentEnd - VarContentStart; }
@@ -385,59 +236,55 @@ public:
void clearFixups() { FixupEnd = FixupStart; }
LLVM_ABI void addFixup(MCFixup Fixup);
LLVM_ABI void appendFixups(ArrayRef<MCFixup> Fixups);
- LLVM_ABI void setFixups(ArrayRef<MCFixup> Fixups);
- MutableArrayRef<MCFixup> getFixups() {
- return MutableArrayRef(getParent()->FixupStorage)
- .slice(FixupStart, FixupEnd - FixupStart);
- }
- ArrayRef<MCFixup> getFixups() const {
- return ArrayRef(getParent()->FixupStorage)
- .slice(FixupStart, FixupEnd - FixupStart);
- }
+ MutableArrayRef<MCFixup> getFixups();
+ ArrayRef<MCFixup> getFixups() const;
// Source fixup offsets are relative to the variable part's start.
// Stored fixup offsets are relative to the fixed part's start.
void setVarFixups(ArrayRef<MCFixup> Fixups);
void clearVarFixups() { setVarFixups({}); }
- MutableArrayRef<MCFixup> getVarFixups() {
- return MutableArrayRef(getParent()->FixupStorage)
- .slice(VarFixupStart, VarFixupEnd - VarFixupStart);
- }
- ArrayRef<MCFixup> getVarFixups() const {
- return ArrayRef(getParent()->FixupStorage)
- .slice(VarFixupStart, VarFixupEnd - VarFixupStart);
- }
+ MutableArrayRef<MCFixup> getVarFixups();
+ ArrayRef<MCFixup> getVarFixups() const;
//== FT_Relaxable functions
unsigned getOpcode() const {
assert(Kind == FT_Relaxable);
return u.relax.Opcode;
}
- ArrayRef<MCOperand> getOperands() const {
- assert(Kind == FT_Relaxable);
- return MutableArrayRef(getParent()->MCOperandStorage)
- .slice(u.relax.OperandStart, u.relax.OperandSize);
+ ArrayRef<MCOperand> getOperands() const;
+ MCInst getInst() const;
+ void setInst(const MCInst &Inst);
+
+ //== FT_Align functions
+ void makeAlign(Align Alignment, int64_t Fill, uint8_t FillLen,
+ unsigned MaxBytesToEmit) {
+ Kind = FT_Align;
+ u.align.EmitNops = false;
+ u.align.Alignment = Alignment;
+ u.align.Fill = Fill;
+ u.align.FillLen = FillLen;
+ u.align.MaxBytesToEmit = MaxBytesToEmit;
}
- MCInst getInst() const {
- assert(Kind == FT_Relaxable);
- MCInst Inst;
- Inst.setOpcode(u.relax.Opcode);
- Inst.setFlags(u.relax.Flags);
- Inst.setOperands(ArrayRef(getParent()->MCOperandStorage)
- .slice(u.relax.OperandStart, u.relax.OperandSize));
- return Inst;
- }
- void setInst(const MCInst &Inst) {
- assert(Kind == FT_Relaxable);
- u.relax.Opcode = Inst.getOpcode();
- u.relax.Flags = Inst.getFlags();
- auto &S = getParent()->MCOperandStorage;
- if (Inst.getNumOperands() > u.relax.OperandSize) {
- u.relax.OperandStart = S.size();
- S.resize_for_overwrite(S.size() + Inst.getNumOperands());
- }
- u.relax.OperandSize = Inst.getNumOperands();
- llvm::copy(Inst, S.begin() + u.relax.OperandStart);
+
+ Align getAlignment() const {
+ assert(Kind == FT_Align);
+ return u.align.Alignment;
+ }
+ int64_t getAlignFill() const {
+ assert(Kind == FT_Align);
+ return u.align.Fill;
+ }
+ uint8_t getAlignFillLen() const {
+ assert(Kind == FT_Align);
+ return u.align.FillLen;
+ }
+ unsigned getAlignMaxBytesToEmit() const {
+ assert(Kind == FT_Align);
+ return u.align.MaxBytesToEmit;
+ }
+ bool hasAlignEmitNops() const {
+ assert(Kind == FT_Align);
+ return u.align.EmitNops;
}
//== FT_LEB functions
@@ -487,52 +334,6 @@ protected:
: MCFragment(FType, HasInstructions) {}
};
-class MCAlignFragment : public MCFragment {
- /// Flag to indicate that (optimal) NOPs should be emitted instead
- /// of using the provided value. The exact interpretation of this flag is
- /// target dependent.
- bool EmitNops : 1;
-
- /// The alignment to ensure, in bytes.
- Align Alignment;
-
- /// The size of the integer (in bytes) of \p Value.
- uint8_t FillLen;
-
- /// The maximum number of bytes to emit; if the alignment
- /// cannot be satisfied in this width then this fragment is ignored.
- unsigned MaxBytesToEmit;
-
- /// Value to use for filling padding bytes.
- int64_t Fill;
-
- /// When emitting Nops some subtargets have specific nop encodings.
- const MCSubtargetInfo *STI = nullptr;
-
-public:
- MCAlignFragment(Align Alignment, int64_t Fill, uint8_t FillLen,
- unsigned MaxBytesToEmit)
- : MCFragment(FT_Align, false), EmitNops(false), Alignment(Alignment),
- FillLen(FillLen), MaxBytesToEmit(MaxBytesToEmit), Fill(Fill) {}
-
- Align getAlignment() const { return Alignment; }
- int64_t getFill() const { return Fill; }
- uint8_t getFillLen() const { return FillLen; }
- unsigned getMaxBytesToEmit() const { return MaxBytesToEmit; }
-
- bool hasEmitNops() const { return EmitNops; }
- void setEmitNops(bool Value, const MCSubtargetInfo *STI) {
- EmitNops = Value;
- this->STI = STI;
- }
-
- const MCSubtargetInfo *getSubtargetInfo() const { return STI; }
-
- static bool classof(const MCFragment *F) {
- return F->getKind() == MCFragment::FT_Align;
- }
-};
-
class MCFillFragment : public MCFragment {
uint8_t ValueSize;
/// Value to use for filling bytes.
@@ -730,6 +531,228 @@ public:
}
};
+/// Instances of this class represent a uniqued identifier for a section in the
+/// current translation unit. The MCContext class uniques and creates these.
+class LLVM_ABI MCSection {
+public:
+ friend MCAssembler;
+ friend MCObjectStreamer;
+ friend class MCFragment;
+ static constexpr unsigned NonUniqueID = ~0U;
+
+ enum SectionVariant {
+ SV_COFF = 0,
+ SV_ELF,
+ SV_GOFF,
+ SV_MachO,
+ SV_Wasm,
+ SV_XCOFF,
+ SV_SPIRV,
+ SV_DXContainer,
+ };
+
+ struct iterator {
+ MCFragment *F = nullptr;
+ iterator() = default;
+ explicit iterator(MCFragment *F) : F(F) {}
+ MCFragment &operator*() const { return *F; }
+ bool operator==(const iterator &O) const { return F == O.F; }
+ bool operator!=(const iterator &O) const { return F != O.F; }
+ iterator &operator++();
+ };
+
+ struct FragList {
+ MCFragment *Head = nullptr;
+ MCFragment *Tail = nullptr;
+ };
+
+private:
+ // At parse time, this holds the fragment list of the current subsection. At
+ // layout time, this holds the concatenated fragment lists of all subsections.
+ FragList *CurFragList;
+ MCSymbol *Begin;
+ MCSymbol *End = nullptr;
+ /// The alignment requirement of this section.
+ Align Alignment;
+ /// The section index in the assemblers section list.
+ unsigned Ordinal = 0;
+
+ /// Whether this section has had instructions emitted into it.
+ bool HasInstructions : 1;
+
+ bool IsRegistered : 1;
+
+ bool IsText : 1;
+ bool IsBss : 1;
+
+ /// Whether the section contains linker-relaxable fragments. If true, the
+ /// offset between two locations may not be fully resolved.
+ bool LinkerRelaxable : 1;
+
+ // Mapping from subsection number to fragment list. At layout time, the
+ // subsection 0 list is replaced with concatenated fragments from all
+ // subsections.
+ SmallVector<std::pair<unsigned, FragList>, 1> Subsections;
+
+ // Content and fixup storage for fragments
+ SmallVector<char, 0> ContentStorage;
+ SmallVector<MCFixup, 0> FixupStorage;
+ SmallVector<MCOperand, 0> MCOperandStorage;
+
+protected:
+ // TODO Make Name private when possible.
+ StringRef Name;
+ SectionVariant Variant;
+
+ MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss,
+ MCSymbol *Begin);
+ // Protected non-virtual dtor prevents destroy through a base class pointer.
+ ~MCSection() {}
+
+public:
+ MCSection(const MCSection &) = delete;
+ MCSection &operator=(const MCSection &) = delete;
+
+ StringRef getName() const { return Name; }
+ bool isText() const { return IsText; }
+
+ SectionVariant getVariant() const { return Variant; }
+
+ MCSymbol *getBeginSymbol() { return Begin; }
+ const MCSymbol *getBeginSymbol() const {
+ return const_cast<MCSection *>(this)->getBeginSymbol();
+ }
+ void setBeginSymbol(MCSymbol *Sym) {
+ assert(!Begin);
+ Begin = Sym;
+ }
+ MCSymbol *getEndSymbol(MCContext &Ctx);
+ bool hasEnded() const;
+
+ Align getAlign() const { return Alignment; }
+ void setAlignment(Align Value) { Alignment = Value; }
+
+ /// Makes sure that Alignment is at least MinAlignment.
+ void ensureMinAlignment(Align MinAlignment) {
+ if (Alignment < MinAlignment)
+ Alignment = MinAlignment;
+ }
+
+ unsigned getOrdinal() const { return Ordinal; }
+ void setOrdinal(unsigned Value) { Ordinal = Value; }
+
+ bool hasInstructions() const { return HasInstructions; }
+ void setHasInstructions(bool Value) { HasInstructions = Value; }
+
+ bool isRegistered() const { return IsRegistered; }
+ void setIsRegistered(bool Value) { IsRegistered = Value; }
+
+ bool isLinkerRelaxable() const { return LinkerRelaxable; }
+ void setLinkerRelaxable() { LinkerRelaxable = true; }
+
+ MCFragment &getDummyFragment() { return *Subsections[0].second.Head; }
+
+ FragList *curFragList() const { return CurFragList; }
+ iterator begin() const { return iterator(CurFragList->Head); }
+ iterator end() const { return {}; }
+
+ void dump(DenseMap<const MCFragment *, SmallVector<const MCSymbol *, 0>>
+ *FragToSyms = nullptr) const;
+
+ virtual void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
+ raw_ostream &OS,
+ uint32_t Subsection) const = 0;
+
+ /// Return true if a .align directive should use "optimized nops" to fill
+ /// instead of 0s.
+ virtual bool useCodeAlign() const = 0;
+
+ /// Check whether this section is "virtual", that is, it has no actual
+ /// object file contents.
+ bool isBssSection() const { return IsBss; }
+};
+
+inline SmallVectorImpl<char> &MCFragment::getContentsForAppending() {
+ SmallVectorImpl<char> &S = getParent()->ContentStorage;
+ if (LLVM_UNLIKELY(ContentEnd != S.size())) {
+ // Move the elements to the end. Reserve space to avoid invalidating
+ // S.begin()+I for `append`.
+ auto Size = ContentEnd - ContentStart;
+ auto I = std::exchange(ContentStart, S.size());
+ S.reserve(S.size() + Size);
+ S.append(S.begin() + I, S.begin() + I + Size);
+ }
+ return S;
+}
+inline void MCFragment::doneAppending() {
+ ContentEnd = getParent()->ContentStorage.size();
+}
+inline MutableArrayRef<char> MCFragment::getContents() {
+ return MutableArrayRef(getParent()->ContentStorage)
+ .slice(ContentStart, ContentEnd - ContentStart);
+}
+inline ArrayRef<char> MCFragment::getContents() const {
+ return ArrayRef(getParent()->ContentStorage)
+ .slice(ContentStart, ContentEnd - ContentStart);
+}
+
+inline MutableArrayRef<char> MCFragment::getVarContents() {
+ return MutableArrayRef(getParent()->ContentStorage)
+ .slice(VarContentStart, VarContentEnd - VarContentStart);
+}
+inline ArrayRef<char> MCFragment::getVarContents() const {
+ return ArrayRef(getParent()->ContentStorage)
+ .slice(VarContentStart, VarContentEnd - VarContentStart);
+}
+
+//== Fixup-related functions manage the parent's storage using FixupStart and
+// FixupEnd.
+inline MutableArrayRef<MCFixup> MCFragment::getFixups() {
+ return MutableArrayRef(getParent()->FixupStorage)
+ .slice(FixupStart, FixupEnd - FixupStart);
+}
+inline ArrayRef<MCFixup> MCFragment::getFixups() const {
+ return ArrayRef(getParent()->FixupStorage)
+ .slice(FixupStart, FixupEnd - FixupStart);
+}
+
+inline MutableArrayRef<MCFixup> MCFragment::getVarFixups() {
+ return MutableArrayRef(getParent()->FixupStorage)
+ .slice(VarFixupStart, VarFixupEnd - VarFixupStart);
+}
+inline ArrayRef<MCFixup> MCFragment::getVarFixups() const {
+ return ArrayRef(getParent()->FixupStorage)
+ .slice(VarFixupStart, VarFixupEnd - VarFixupStart);
+}
+
+//== FT_Relaxable functions
+inline ArrayRef<MCOperand> MCFragment::getOperands() const {
+ assert(Kind == FT_Relaxable);
+ return MutableArrayRef(getParent()->MCOperandStorage)
+ .slice(u.relax.OperandStart, u.relax.OperandSize);
+}
+inline MCInst MCFragment::getInst() const {
+ assert(Kind == FT_Relaxable);
+ MCInst Inst;
+ Inst.setOpcode(u.relax.Opcode);
+ Inst.setFlags(u.relax.Flags);
+ Inst.setOperands(ArrayRef(getParent()->MCOperandStorage)
+ .slice(u.relax.OperandStart, u.relax.OperandSize));
+ return Inst;
+}
+inline void MCFragment::setInst(const MCInst &Inst) {
+ assert(Kind == FT_Relaxable);
+ u.relax.Opcode = Inst.getOpcode();
+ u.relax.Flags = Inst.getFlags();
+ auto &S = getParent()->MCOperandStorage;
+ if (Inst.getNumOperands() > u.relax.OperandSize) {
+ u.relax.OperandStart = S.size();
+ S.resize_for_overwrite(S.size() + Inst.getNumOperands());
+ }
+ u.relax.OperandSize = Inst.getNumOperands();
+ llvm::copy(Inst, S.begin() + u.relax.OperandStart);
+}
+
inline MCSection::iterator &MCSection::iterator::operator++() {
F = F->Next;
return *this;
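
A minimal sketch, in plain STL rather than the LLVM classes, of the append idiom behind getContentsForAppending() above: all fragment contents share one buffer, and a fragment must first be relocated to the buffer's tail before it may grow in place. The names Block and contentsForAppending are illustrative only.

  #include <cstddef>
  #include <utility>
  #include <vector>

  struct Block {
    size_t Start = 0, End = 0; // half-open byte range into the shared buffer
  };

  std::vector<char> &contentsForAppending(std::vector<char> &Storage, Block &B) {
    if (B.End != Storage.size()) {
      // Not at the tail: copy the block's bytes to the end so that later
      // push_backs extend this block instead of clobbering a neighbor.
      size_t Size = B.End - B.Start;
      size_t I = std::exchange(B.Start, Storage.size());
      Storage.reserve(Storage.size() + Size); // at most one reallocation
      for (size_t K = 0; K != Size; ++K)
        Storage.push_back(Storage[I + K]);
    }
    return Storage;
  }

  void doneAppending(const std::vector<char> &Storage, Block &B) {
    B.End = Storage.size(); // mirrors MCFragment::doneAppending()
  }
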
diff --git a/llvm/include/llvm/MC/MCSectionCOFF.h b/llvm/include/llvm/MC/MCSectionCOFF.h
index 4472a12..f979413a 100644
--- a/llvm/include/llvm/MC/MCSectionCOFF.h
+++ b/llvm/include/llvm/MC/MCSectionCOFF.h
@@ -82,7 +82,6 @@ public:
raw_ostream &OS,
uint32_t Subsection) const override;
bool useCodeAlign() const override;
- StringRef getVirtualSectionKind() const override;
unsigned getOrAssignWinCFISectionID(unsigned *NextID) const {
if (WinCFISectionID == ~0U)
diff --git a/llvm/include/llvm/MC/MCSectionELF.h b/llvm/include/llvm/MC/MCSectionELF.h
index f09d305..64a4daf 100644
--- a/llvm/include/llvm/MC/MCSectionELF.h
+++ b/llvm/include/llvm/MC/MCSectionELF.h
@@ -68,10 +68,6 @@ private:
Group.getPointer()->setIsSignature();
}
- // TODO Delete after we stop supporting generation of GNU-style .zdebug_*
- // sections.
- void setSectionName(StringRef Name) { this->Name = Name; }
-
public:
/// Decides whether a '.section' directive should be printed before the
/// section name
@@ -88,7 +84,6 @@ public:
raw_ostream &OS,
uint32_t Subsection) const override;
bool useCodeAlign() const override;
- StringRef getVirtualSectionKind() const override;
bool isUnique() const { return UniqueID != NonUniqueID; }
unsigned getUniqueID() const { return UniqueID; }
diff --git a/llvm/include/llvm/MC/MCSectionGOFF.h b/llvm/include/llvm/MC/MCSectionGOFF.h
index 9e3f95e..b166397 100644
--- a/llvm/include/llvm/MC/MCSectionGOFF.h
+++ b/llvm/include/llvm/MC/MCSectionGOFF.h
@@ -111,7 +111,7 @@ public:
// Returns the text style for a section. Only defined for ED and PR sections.
GOFF::ESDTextStyle getTextStyle() const {
- assert((isED() || isPR() || isVirtualSection()) && "Expect ED or PR section");
+ assert((isED() || isPR() || isBssSection()) && "Expect ED or PR section");
if (isED())
return EDAttributes.TextStyle;
if (isPR())
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 4b91dbc..dfaf348 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -259,6 +259,8 @@ class LLVM_ABI MCStreamer {
bool AllowAutoPadding = false;
protected:
+ bool IsObj = false;
+
// Symbol of the current epilog for which we are processing SEH directives.
WinEH::FrameInfo::Epilog *CurrentWinEpilog = nullptr;
@@ -270,6 +272,8 @@ protected:
/// section changes.
virtual void changeSection(MCSection *, uint32_t);
+ void addFragment(MCFragment *F);
+
virtual void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
virtual void emitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
@@ -308,6 +312,7 @@ public:
virtual void reset();
MCContext &getContext() const { return Context; }
+ bool isObj() const { return IsObj; }
// MCObjectStreamer has an MCAssembler and allows more expression folding at
// parse time.
@@ -425,10 +430,15 @@ public:
}
MCFragment *getCurrentFragment() const {
+ // Ensure consistency with the section stack.
assert(!getCurrentSection().first ||
CurFrag->getParent() == getCurrentSection().first);
+ // Ensure we eagerly allocate an empty fragment after adding a fragment with
+ // a variable-size tail.
+ assert(!CurFrag || CurFrag->getKind() == MCFragment::FT_Data);
return CurFrag;
}
+ size_t getCurFragOffset() const { return getCurrentFragment()->Offset; }
/// Save the current and previous section on the section stack.
void pushSection() {
SectionStack.push_back(
@@ -456,9 +466,6 @@ public:
MCSymbol *endSection(MCSection *Section);
- void insert(MCFragment *F);
- void newFragment();
-
 /// Returns the mnemonic for \p MI if the streamer has access to an
 /// instruction printer, and returns an empty string otherwise.
virtual StringRef getMnemonic(const MCInst &MI) const { return ""; }
@@ -979,7 +986,7 @@ public:
const MCSymbol *Lo);
virtual MCSymbol *getDwarfLineTableSymbol(unsigned CUID);
- virtual void emitCFISections(bool EH, bool Debug);
+ virtual void emitCFISections(bool EH, bool Debug, bool SFrame);
void emitCFIStartProc(bool IsSimple, SMLoc Loc = SMLoc());
void emitCFIEndProc();
virtual void emitCFIDefCfa(int64_t Register, int64_t Offset, SMLoc Loc = {});
diff --git a/llvm/include/llvm/MC/MCTargetOptions.h b/llvm/include/llvm/MC/MCTargetOptions.h
index d95adf9..235d58d 100644
--- a/llvm/include/llvm/MC/MCTargetOptions.h
+++ b/llvm/include/llvm/MC/MCTargetOptions.h
@@ -102,6 +102,9 @@ public:
// functions on Darwins.
bool EmitCompactUnwindNonCanonical : 1;
+ // Whether to emit SFrame unwind sections.
+ bool EmitSFrameUnwind : 1;
+
// Whether or not to use full register names on PowerPC.
bool PPCUseFullRegisterNames : 1;
diff --git a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h
index b057eff..adfdccd 100644
--- a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h
+++ b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h
@@ -40,6 +40,8 @@ LLVM_ABI EmitDwarfUnwindType getEmitDwarfUnwind();
LLVM_ABI bool getEmitCompactUnwindNonCanonical();
+LLVM_ABI bool getEmitSFrameUnwind();
+
LLVM_ABI bool getShowMCInst();
LLVM_ABI bool getFatalWarnings();
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index a3aa0d9..ced1afd 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -1479,6 +1479,7 @@ template <class ELFT> Triple::OSType ELFObjectFile<ELFT>::getOS() const {
case ELF::ELFOSABI_OPENBSD:
return Triple::OpenBSD;
case ELF::ELFOSABI_CUDA:
+ case ELF::ELFOSABI_CUDA_V2:
return Triple::CUDA;
case ELF::ELFOSABI_AMDGPU_HSA:
return Triple::AMDHSA;
diff --git a/llvm/include/llvm/Object/SFrameParser.h b/llvm/include/llvm/Object/SFrameParser.h
new file mode 100644
index 0000000..cf4fe20
--- /dev/null
+++ b/llvm/include/llvm/Object/SFrameParser.h
@@ -0,0 +1,48 @@
+//===- SFrameParser.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_SFRAME_H
+#define LLVM_OBJECT_SFRAME_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+
+namespace llvm {
+namespace object {
+
+template <endianness E> class SFrameParser {
+public:
+ static Expected<SFrameParser> create(ArrayRef<uint8_t> Contents);
+
+ const sframe::Preamble<E> &getPreamble() const { return Header.Preamble; }
+ const sframe::Header<E> &getHeader() const { return Header; }
+
+ bool usesFixedRAOffset() const {
+ return getHeader().ABIArch == sframe::ABI::AMD64EndianLittle;
+ }
+ bool usesFixedFPOffset() const {
+ return false; // Not used in any currently defined ABI.
+ }
+
+private:
+ ArrayRef<uint8_t> Data;
+ const sframe::Header<E> &Header;
+
+ SFrameParser(ArrayRef<uint8_t> Data, const sframe::Header<E> &Header)
+ : Data(Data), Header(Header) {}
+};
+
+extern template class SFrameParser<endianness::big>;
+extern template class SFrameParser<endianness::little>;
+
+} // end namespace object
+} // end namespace llvm
+
+#endif // LLVM_OBJECT_SFRAME_H
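
A rough usage sketch for the new parser; inspectSFrame and the surrounding error plumbing are hypothetical, but create(), getHeader(), and usesFixedRAOffset() are as declared above.

  #include "llvm/Object/SFrameParser.h"
  #include "llvm/Support/Error.h"

  using namespace llvm;

  static Error inspectSFrame(ArrayRef<uint8_t> SectionBytes) {
    auto ParserOr =
        object::SFrameParser<endianness::little>::create(SectionBytes);
    if (!ParserOr)
      return ParserOr.takeError();
    // The header gates all further decoding; for the AMD64 little-endian
    // ABI the return-address offset is fixed rather than per-entry.
    if (ParserOr->usesFixedRAOffset()) {
      // ... take the RA offset from the header ...
    }
    return Error::success();
  }
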
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 732fdc7..bee2106 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -113,6 +113,7 @@ MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass())
MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass())
MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass())
MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
+MACHINE_FUNCTION_PASS("finalizebundle-test", FinalizeBundleTestPass())
MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass())
MACHINE_FUNCTION_PASS("init-undef", InitUndefPass())
MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
diff --git a/llvm/include/llvm/Support/AArch64AttributeParser.h b/llvm/include/llvm/Support/AArch64AttributeParser.h
index aa82ca1..796dbfd 100644
--- a/llvm/include/llvm/Support/AArch64AttributeParser.h
+++ b/llvm/include/llvm/Support/AArch64AttributeParser.h
@@ -25,6 +25,17 @@ public:
: ELFExtendedAttrParser(nullptr, returnTagsNamesMap()) {}
};
+// Used for extracting AArch64 Build Attributes
+struct AArch64BuildAttrSubsections {
+ struct PauthSubSection {
+ uint64_t TagPlatform = 0;
+ uint64_t TagSchema = 0;
+ } Pauth;
+ uint32_t AndFeatures = 0;
+};
+
+AArch64BuildAttrSubsections
+extractBuildAttributesSubsections(const llvm::AArch64AttributeParser &);
} // namespace llvm
#endif // LLVM_SUPPORT_AARCH64ATTRIBUTEPARSER_H
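
A sketch of consuming the flattened view; readBuildAttrs is hypothetical, and the parse step that feeds the section bytes to AArch64AttributeParser is elided.

  #include "llvm/Support/AArch64AttributeParser.h"
  #include <cstdint>

  void readBuildAttrs(const llvm::AArch64AttributeParser &Parser) {
    llvm::AArch64BuildAttrSubsections Subs =
        llvm::extractBuildAttributesSubsections(Parser);
    uint64_t PauthPlatform = Subs.Pauth.TagPlatform; // PAuth ABI tags
    uint64_t PauthSchema = Subs.Pauth.TagSchema;
    uint32_t FeatureBits = Subs.AndFeatures; // feature AND-bits (e.g. BTI/PAC)
    (void)PauthPlatform; (void)PauthSchema; (void)FeatureBits;
  }
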
diff --git a/llvm/include/llvm/Support/AlwaysTrue.h b/llvm/include/llvm/Support/AlwaysTrue.h
new file mode 100644
index 0000000..b696856
--- /dev/null
+++ b/llvm/include/llvm/Support/AlwaysTrue.h
@@ -0,0 +1,25 @@
+//===--- AlwaysTrue.h - Helper for opaque truthy values ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ALWAYS_TRUE_H
+#define LLVM_SUPPORT_ALWAYS_TRUE_H
+
+#include <cstdlib>
+
+namespace llvm {
+inline bool getNonFoldableAlwaysTrue() {
+ // Some parts of the codebase require a "constant true value" used as a
+ // predicate. These cases require that even with LTO and static linking,
+ // it's not possible for the compiler to fold the value. As compilers
+ // aren't smart enough to know that getenv() never returns -1, this will do
+ // the job.
+ return std::getenv("LLVM_IGNORED_ENV_VAR") != (char *)-1;
+}
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_ALWAYS_TRUE_H
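
A sketch of the intended use as an optimization barrier; pickSide is illustrative.

  #include "llvm/Support/AlwaysTrue.h"

  int pickSide(int A, int B) {
    // Always yields A at run time, but the branch survives constant folding
    // and LTO because the compiler cannot prove what getenv() returns.
    return llvm::getNonFoldableAlwaysTrue() ? A : B;
  }
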
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index adaa75c..ca725b8 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -1518,11 +1518,18 @@ public:
[](const typename ParserClass::parser_data_type &) {};
};
-extern template class opt<unsigned>;
-extern template class opt<int>;
-extern template class opt<std::string>;
-extern template class opt<char>;
-extern template class opt<bool>;
+#if !(defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS) && defined(_MSC_VER))
+// Only instantiate opt<std::string> when not building a Windows DLL. When
+// exporting opt<std::string>, MSVC implicitly exports symbols for
+// std::basic_string through transitive inheritance via std::string. These
+// symbols may appear in clients, leading to duplicate symbol conflicts.
+extern template class LLVM_TEMPLATE_ABI opt<std::string>;
+#endif
+
+extern template class LLVM_TEMPLATE_ABI opt<unsigned>;
+extern template class LLVM_TEMPLATE_ABI opt<int>;
+extern template class LLVM_TEMPLATE_ABI opt<char>;
+extern template class LLVM_TEMPLATE_ABI opt<bool>;
//===----------------------------------------------------------------------===//
// Default storage class definition: external storage. This implementation
diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h
new file mode 100644
index 0000000..9556bf2
--- /dev/null
+++ b/llvm/include/llvm/Support/DebugLog.h
@@ -0,0 +1,68 @@
+//===- llvm/Support/DebugLog.h - Logging like debug output ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file contains macros for logging-style debug output. It builds on the
+// support in Debug.h and provides a helper for the common single-line debug
+// output style.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_DEBUGLOG_H
+#define LLVM_SUPPORT_DEBUGLOG_H
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+#ifndef NDEBUG
+
+// Output with given inputs and trailing newline. E.g.,
+// LDBG() << "Bitset contains: " << Bitset;
+// is equivalent to
+// LLVM_DEBUG(dbgs() << DEBUG_TYPE << " [" << __FILE__ << ":" << __LINE__
+// << "] " << "Bitset contains: " << Bitset << "\n");
+#define LDBG() DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), DEBUG_TYPE)
+
+#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, TYPE) \
+ for (bool _c = (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)); _c; \
+ _c = false) \
+ ::llvm::impl::LogWithNewline(TYPE, __FILE__, __LINE__, (STREAM))
+
+namespace impl {
+class LogWithNewline {
+public:
+ LogWithNewline(const char *debug_type, const char *file, int line,
+ raw_ostream &os)
+ : os(os) {
+ if (debug_type)
+ os << debug_type << " ";
+ os << "[" << file << ":" << line << "] ";
+ }
+ ~LogWithNewline() { os << '\n'; }
+ template <typename T> raw_ostream &operator<<(const T &t) && {
+ return os << t;
+ }
+
+ // Prevent copying, as this class manages newline responsibility and is
+ // intended for use as a temporary.
+ LogWithNewline(const LogWithNewline &) = delete;
+ LogWithNewline &operator=(const LogWithNewline &) = delete;
+ LogWithNewline &operator=(LogWithNewline &&) = delete;
+
+private:
+ raw_ostream &os;
+};
+} // end namespace impl
+#else
+// As with the other macros in Debug.h, when compiling without assertions the
+// -debug-* options and all inputs to LDBG() are ignored.
+#define LDBG() \
+ for (bool _c = false; _c; _c = false) \
+ ::llvm::nulls()
+#endif
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_DEBUGLOG_H
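
A sketch of the new macro in use; visit is illustrative and assumes a translation unit that defines DEBUG_TYPE before the include.

  #define DEBUG_TYPE "my-pass"
  #include "llvm/Support/DebugLog.h"

  void visit(int Count) {
    // With assertions enabled and -debug (or -debug-only=my-pass) given,
    // prints e.g.: my-pass [MyPass.cpp:6] visiting 3 nodes
    // The trailing newline is added automatically.
    LDBG() << "visiting " << Count << " nodes";
  }
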
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 59e8117..8e83b046 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -276,14 +276,14 @@ LLVM_ABI bool isX18ReservedByDefault(const Triple &TT);
// For a given set of feature names, which can be either target-features, or
// fmv-features metadata, expand their dependencies and then return a bitmask
// corresponding to the entries of AArch64::FeatPriorities.
-LLVM_ABI uint64_t getFMVPriority(ArrayRef<StringRef> Features);
+LLVM_ABI APInt getFMVPriority(ArrayRef<StringRef> Features);
// For a given set of FMV feature names, expand their dependencies and then
// return a bitmask corresponding to the entries of AArch64::CPUFeatures.
// The values in CPUFeatures are not bitmasks themselves, they are sequential
// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether
// a certain FMV feature is available on the host.
-LLVM_ABI uint64_t getCpuSupportsMask(ArrayRef<StringRef> Features);
+LLVM_ABI APInt getCpuSupportsMask(ArrayRef<StringRef> Features);
LLVM_ABI void PrintSupportedExtensions();
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index bb79d25..3f5f427 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -325,7 +325,6 @@ LLVM_ABI void salvageDebugInfo(Instruction &I);
/// Mark undef if salvaging cannot be completed.
LLVM_ABI void
salvageDebugInfoForDbgValues(Instruction &I,
- ArrayRef<DbgVariableIntrinsic *> Insns,
ArrayRef<DbgVariableRecord *> DPInsns);
/// Given an instruction \p I and DIExpression \p DIExpr operating on
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
index f288bdf..e0cdcf8 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
@@ -57,7 +57,6 @@ struct AllocaInfo {
struct StackInfo {
MapVector<AllocaInst *, AllocaInfo> AllocasToInstrument;
- SmallVector<Instruction *, 4> UnrecognizedLifetimes;
SmallVector<Instruction *, 8> RetVec;
bool CallsReturnTwice = false;
};
diff --git a/llvm/include/llvm/Transforms/Utils/ProfileVerify.h b/llvm/include/llvm/Transforms/Utils/ProfileVerify.h
new file mode 100644
index 0000000..7834305
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/ProfileVerify.h
@@ -0,0 +1,36 @@
+//===- ProfileVerify.h - Verify profile info for testing --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Inject profile information, as part of tests, to verify passes don't
+// accidentally drop it.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_UTILS_PROFILEVERIFY_H
+#define LLVM_TRANSFORMS_UTILS_PROFILEVERIFY_H
+
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+/// Inject MD_prof metadata where it's missing. Used for testing that passes
+/// don't accidentally drop this metadata.
+class ProfileInjectorPass : public PassInfoMixin<ProfileInjectorPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+/// Checks that MD_prof is present on every instruction that supports it. Used
+/// in conjunction with the ProfileInjectorPass. MD_prof "unknown" is considered
+/// valid (i.e. !{!"unknown"}).
+class ProfileVerifierPass : public PassInfoMixin<ProfileVerifierPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+#endif
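
A sketch of the intended bracketing; direct construction is shown only for illustration, as in-tree use would normally go through the pass registry.

  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Utils/ProfileVerify.h"

  void addProfileBracketing(llvm::FunctionPassManager &FPM) {
    FPM.addPass(llvm::ProfileInjectorPass()); // seed MD_prof where missing
    // ... passes under test run in between ...
    FPM.addPass(llvm::ProfileVerifierPass()); // flag any dropped MD_prof
  }
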
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 9c1c2c6..ec78386 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1801,6 +1801,44 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::nvvm_d2ull_rn:
case Intrinsic::nvvm_d2ull_rp:
case Intrinsic::nvvm_d2ull_rz:
+
+ // NVVM math intrinsics:
+ case Intrinsic::nvvm_ceil_d:
+ case Intrinsic::nvvm_ceil_f:
+ case Intrinsic::nvvm_ceil_ftz_f:
+
+ case Intrinsic::nvvm_fabs:
+ case Intrinsic::nvvm_fabs_ftz:
+
+ case Intrinsic::nvvm_floor_d:
+ case Intrinsic::nvvm_floor_f:
+ case Intrinsic::nvvm_floor_ftz_f:
+
+ case Intrinsic::nvvm_rcp_rm_d:
+ case Intrinsic::nvvm_rcp_rm_f:
+ case Intrinsic::nvvm_rcp_rm_ftz_f:
+ case Intrinsic::nvvm_rcp_rn_d:
+ case Intrinsic::nvvm_rcp_rn_f:
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ case Intrinsic::nvvm_rcp_rp_d:
+ case Intrinsic::nvvm_rcp_rp_f:
+ case Intrinsic::nvvm_rcp_rp_ftz_f:
+ case Intrinsic::nvvm_rcp_rz_d:
+ case Intrinsic::nvvm_rcp_rz_f:
+ case Intrinsic::nvvm_rcp_rz_ftz_f:
+
+ case Intrinsic::nvvm_round_d:
+ case Intrinsic::nvvm_round_f:
+ case Intrinsic::nvvm_round_ftz_f:
+
+ case Intrinsic::nvvm_saturate_d:
+ case Intrinsic::nvvm_saturate_f:
+ case Intrinsic::nvvm_saturate_ftz_f:
+
+ case Intrinsic::nvvm_sqrt_f:
+ case Intrinsic::nvvm_sqrt_rn_d:
+ case Intrinsic::nvvm_sqrt_rn_f:
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
return !Call->isStrictFP();
// Sign operations are actually bitwise operations, they do not raise
@@ -1818,6 +1856,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::nearbyint:
case Intrinsic::rint:
case Intrinsic::canonicalize:
+
// Constrained intrinsics can be folded if FP environment is known
// to compiler.
case Intrinsic::experimental_constrained_fma:
@@ -1971,16 +2010,49 @@ static APFloat FTZPreserveSign(const APFloat &V) {
return V;
}
-Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
- Type *Ty) {
+static APFloat FlushToPositiveZero(const APFloat &V) {
+ if (V.isDenormal())
+ return APFloat::getZero(V.getSemantics(), false);
+ return V;
+}
+
+static APFloat FlushWithDenormKind(const APFloat &V,
+ DenormalMode::DenormalModeKind DenormKind) {
+ assert(DenormKind != DenormalMode::DenormalModeKind::Invalid &&
+ DenormKind != DenormalMode::DenormalModeKind::Dynamic);
+ switch (DenormKind) {
+ case DenormalMode::DenormalModeKind::IEEE:
+ return V;
+ case DenormalMode::DenormalModeKind::PreserveSign:
+ return FTZPreserveSign(V);
+ case DenormalMode::DenormalModeKind::PositiveZero:
+ return FlushToPositiveZero(V);
+ default:
+ llvm_unreachable("Invalid denormal mode!");
+ }
+}
+
+Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty,
+ DenormalMode DenormMode = DenormalMode::getIEEE()) {
+ if (!DenormMode.isValid() ||
+ DenormMode.Input == DenormalMode::DenormalModeKind::Dynamic ||
+ DenormMode.Output == DenormalMode::DenormalModeKind::Dynamic)
+ return nullptr;
+
llvm_fenv_clearexcept();
- double Result = NativeFP(V.convertToDouble());
+ auto Input = FlushWithDenormKind(V, DenormMode.Input);
+ double Result = NativeFP(Input.convertToDouble());
if (llvm_fenv_testexcept()) {
llvm_fenv_clearexcept();
return nullptr;
}
- return GetConstantFoldFPValue(Result, Ty);
+ Constant *Output = GetConstantFoldFPValue(Result, Ty);
+ if (DenormMode.Output == DenormalMode::DenormalModeKind::IEEE)
+ return Output;
+ const auto *CFP = static_cast<ConstantFP *>(Output);
+ const auto Res = FlushWithDenormKind(CFP->getValueAPF(), DenormMode.Output);
+ return ConstantFP::get(Ty->getContext(), Res);
}
#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
@@ -2550,6 +2622,94 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return ConstantFoldFP(atan, APF, Ty);
case Intrinsic::sqrt:
return ConstantFoldFP(sqrt, APF, Ty);
+
+ // NVVM Intrinsics:
+ case Intrinsic::nvvm_ceil_ftz_f:
+ case Intrinsic::nvvm_ceil_f:
+ case Intrinsic::nvvm_ceil_d:
+ return ConstantFoldFP(
+ ceil, APF, Ty,
+ nvvm::GetNVVMDenormMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ case Intrinsic::nvvm_fabs_ftz:
+ case Intrinsic::nvvm_fabs:
+ return ConstantFoldFP(
+ fabs, APF, Ty,
+ nvvm::GetNVVMDenormMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ case Intrinsic::nvvm_floor_ftz_f:
+ case Intrinsic::nvvm_floor_f:
+ case Intrinsic::nvvm_floor_d:
+ return ConstantFoldFP(
+ floor, APF, Ty,
+ nvvm::GetNVVMDenormMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ case Intrinsic::nvvm_rcp_rm_ftz_f:
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ case Intrinsic::nvvm_rcp_rp_ftz_f:
+ case Intrinsic::nvvm_rcp_rz_ftz_f:
+ case Intrinsic::nvvm_rcp_rm_d:
+ case Intrinsic::nvvm_rcp_rm_f:
+ case Intrinsic::nvvm_rcp_rn_d:
+ case Intrinsic::nvvm_rcp_rn_f:
+ case Intrinsic::nvvm_rcp_rp_d:
+ case Intrinsic::nvvm_rcp_rp_f:
+ case Intrinsic::nvvm_rcp_rz_d:
+ case Intrinsic::nvvm_rcp_rz_f: {
+ APFloat::roundingMode RoundMode = nvvm::GetRCPRoundingMode(IntrinsicID);
+ bool IsFTZ = nvvm::RCPShouldFTZ(IntrinsicID);
+
+ auto Denominator = IsFTZ ? FTZPreserveSign(APF) : APF;
+ APFloat Res = APFloat::getOne(APF.getSemantics());
+ APFloat::opStatus Status = Res.divide(Denominator, RoundMode);
+
+ if (Status == APFloat::opOK || Status == APFloat::opInexact) {
+ if (IsFTZ)
+ Res = FTZPreserveSign(Res);
+ return ConstantFP::get(Ty->getContext(), Res);
+ }
+ return nullptr;
+ }
+
+ case Intrinsic::nvvm_round_ftz_f:
+ case Intrinsic::nvvm_round_f:
+ case Intrinsic::nvvm_round_d: {
+ // Use APFloat implementation instead of native libm call, as some
+ // implementations (e.g. on PPC) do not preserve the sign of negative 0.
+ bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
+ auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+ V.roundToIntegral(APFloat::rmNearestTiesToAway);
+ return ConstantFP::get(Ty->getContext(), V);
+ }
+
+ case Intrinsic::nvvm_saturate_ftz_f:
+ case Intrinsic::nvvm_saturate_d:
+ case Intrinsic::nvvm_saturate_f: {
+ bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
+ auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+ if (V.isNegative() || V.isZero() || V.isNaN())
+ return ConstantFP::getZero(Ty);
+ APFloat One = APFloat::getOne(APF.getSemantics());
+ if (V > One)
+ return ConstantFP::get(Ty->getContext(), One);
+ return ConstantFP::get(Ty->getContext(), APF);
+ }
+
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ case Intrinsic::nvvm_sqrt_f:
+ case Intrinsic::nvvm_sqrt_rn_d:
+ case Intrinsic::nvvm_sqrt_rn_f:
+ if (APF.isNegative())
+ return nullptr;
+ return ConstantFoldFP(
+ sqrt, APF, Ty,
+ nvvm::GetNVVMDenormMode(
+ nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
+
+ // AMDGCN Intrinsics:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_sin: {
double V = getValueAsDouble(Op);
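
A worked example of what the nvvm.rcp *_rz_* fold above computes, using APFloat directly; rcpRZ is illustrative.

  #include "llvm/ADT/APFloat.h"

  llvm::APFloat rcpRZ(const llvm::APFloat &X) {
    // 1.0 / X with round-toward-zero, as GetRCPRoundingMode selects for
    // the *_rz_* variants.
    llvm::APFloat Res = llvm::APFloat::getOne(X.getSemantics());
    Res.divide(X, llvm::APFloat::rmTowardZero);
    return Res;
  }
  // rcpRZ(APFloat(3.0f)) yields 0x3EAAAAAA (0.3333333...), one ulp below
  // the round-to-nearest quotient 0x3EAAAAAB.
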
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index f3a32d3..14be385 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -589,11 +589,11 @@ void RuntimePointerChecking::groupChecks(
// dependence. Not grouping the checks for a[i] and a[i + 9000] allows
// us to perform an accurate check in this case.
//
- // The above case requires that we have an UnknownDependence between
- // accesses to the same underlying object. This cannot happen unless
- // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
- // is also false. In this case we will use the fallback path and create
- // separate checking groups for all pointers.
+ // In the above case, we have a non-constant distance and an Unknown
+ // dependence between accesses to the same underlying object, and could retry
+ // with runtime checks. Therefore UseDependencies is false. In this case we
+ // will use the fallback path and create separate checking groups for all
+ // pointers.
// If we don't have the dependency partitions, construct a new
// checking pointer group for each pointer. This is also required
@@ -819,7 +819,7 @@ public:
/// perform dependency checking.
///
/// Note that this can later be cleared if we retry memcheck analysis without
- /// dependency checking (i.e. FoundNonConstantDistanceDependence).
+ /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
/// We decided that no dependence analysis would be used. Reset the state.
@@ -896,7 +896,7 @@ private:
///
/// Note that, this is different from isDependencyCheckNeeded. When we retry
/// memcheck analysis without dependency checking
- /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
+ /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
/// cleared while this remains set if we have potentially dependent accesses.
bool IsRTCheckAnalysisNeeded = false;
@@ -2079,11 +2079,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
if (StrideAScaled == StrideBScaled)
CommonStride = StrideAScaled;
- // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
- // to consider retrying with runtime checks. Historically, we did not set it
- // when (unscaled) strides were different but there is no inherent reason to.
+ // TODO: Historically, we didn't retry with runtime checks when (unscaled)
+ // strides were different, but there is no inherent reason not to.
if (!isa<SCEVConstant>(Dist))
- FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt;
+ ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;
// If distance is a SCEVCouldNotCompute, return Unknown immediately.
if (isa<SCEVCouldNotCompute>(Dist)) {
@@ -2712,7 +2711,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
DepsAreSafe =
DepChecker->areDepsSafe(DepCands, Accesses.getDependenciesToCheck());
- if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) {
+ if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
// Clear the dependency checks. We assume they are not needed.
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index e8d4e37..f1c3155 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -121,8 +121,18 @@ void ProfileSummaryInfo::computeThresholds() {
ProfileSummaryBuilder::getHotCountThreshold(DetailedSummary);
ColdCountThreshold =
ProfileSummaryBuilder::getColdCountThreshold(DetailedSummary);
- assert(ColdCountThreshold <= HotCountThreshold &&
- "Cold count threshold cannot exceed hot count threshold!");
+ // When the hot and cold thresholds are identical, we would classify
+ // a count value as both hot and cold since we are doing an inclusive check
+ // (see ::is{Hot|Cold}Count()). To avoid this undesirable overlap, ensure the
+ // thresholds are distinct.
+ if (HotCountThreshold == ColdCountThreshold) {
+ if (ColdCountThreshold > 0)
+ (*ColdCountThreshold)--;
+ else
+ (*HotCountThreshold)++;
+ }
+ assert(ColdCountThreshold < HotCountThreshold &&
+ "Cold count threshold should be less than hot count threshold!");
if (!hasPartialSampleProfile() || !ScalePartialSampleProfileWorkingSetSize) {
HasHugeWorkingSetSize =
HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
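
The nudge above in isolation, as a small sketch (separate is illustrative):

  #include <cstdint>
  #include <utility>

  std::pair<uint64_t, uint64_t> separate(uint64_t Hot, uint64_t Cold) {
    if (Hot == Cold) {
      if (Cold > 0)
        --Cold; // e.g. Hot = Cold = 100 becomes (100, 99)
      else
        ++Hot;  // both zero becomes (1, 0)
    }
    return {Hot, Cold}; // now Cold < Hot, so no count is both hot and cold
  }
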
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 24adfa3..0990a0d 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11418,8 +11418,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred,
XNonConstOp = X;
XFlagsPresent = ExpectedFlags;
}
- if (!isa<SCEVConstant>(XConstOp) ||
- (XFlagsPresent & ExpectedFlags) != ExpectedFlags)
+ if (!isa<SCEVConstant>(XConstOp))
return false;
if (!splitBinaryAdd(Y, YConstOp, YNonConstOp, YFlagsPresent)) {
@@ -11428,12 +11427,20 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred,
YFlagsPresent = ExpectedFlags;
}
- if (!isa<SCEVConstant>(YConstOp) ||
- (YFlagsPresent & ExpectedFlags) != ExpectedFlags)
+ if (YNonConstOp != XNonConstOp)
return false;
- if (YNonConstOp != XNonConstOp)
+ if (!isa<SCEVConstant>(YConstOp))
+ return false;
+
+ // When matching ADDs with NUW flags (and unsigned predicates), only the
+ // second ADD (with the larger constant) requires NUW.
+ if ((YFlagsPresent & ExpectedFlags) != ExpectedFlags)
+ return false;
+ if (ExpectedFlags != SCEV::FlagNUW &&
+ (XFlagsPresent & ExpectedFlags) != ExpectedFlags) {
return false;
+ }
OutC1 = cast<SCEVConstant>(XConstOp)->getAPInt();
OutC2 = cast<SCEVConstant>(YConstOp)->getAPInt();
@@ -11472,7 +11479,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred,
std::swap(LHS, RHS);
[[fallthrough]];
case ICmpInst::ICMP_ULE:
- // (X + C1)<nuw> u<= (X + C2)<nuw> for C1 u<= C2.
+ // (X + C1) u<= (X + C2)<nuw> for C1 u<= C2.
if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNUW) && C1.ule(C2))
return true;
@@ -11482,7 +11489,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred,
std::swap(LHS, RHS);
[[fallthrough]];
case ICmpInst::ICMP_ULT:
- // (X + C1)<nuw> u< (X + C2)<nuw> if C1 u< C2.
+ // (X + C1) u< (X + C2)<nuw> if C1 u< C2.
if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNUW) && C1.ult(C2))
return true;
break;
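
The relaxed matching rests on a short argument; as a sketch, with n-bit unsigned values and +Z denoting wrap-free integer addition:

  nuw on (X + C2)   ==>   X +Z C2 < 2^n
  C1 u<= C2         ==>   X +Z C1 <= X +Z C2 < 2^n

so X + C1 cannot wrap either, and (X + C1) u<= (X + C2) follows without requiring nuw on the add with the smaller constant.
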
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index 21f54c7..34a7a04 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -63,10 +63,7 @@ bool StackLifetime::isAliveAfter(const AllocaInst *AI,
// markers has the same size and points to the alloca start.
static const AllocaInst *findMatchingAlloca(const IntrinsicInst &II,
const DataLayout &DL) {
- const AllocaInst *AI = findAllocaForValue(II.getArgOperand(1), true);
- if (!AI)
- return nullptr;
-
+ const AllocaInst *AI = cast<AllocaInst>(II.getArgOperand(1));
auto AllocaSize = AI->getAllocationSize(DL);
if (!AllocaSize)
return nullptr;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8a470eb..55ba52a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1423,7 +1423,7 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const {
return TTIImpl->hasArmWideBranch(Thumb);
}
-uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const {
+APInt TargetTransformInfo::getFeatureMask(const Function &F) const {
return TTIImpl->getFeatureMask(F);
}
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 61a322b..af85ce4 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7912,6 +7912,8 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) {
case Intrinsic::ushl_sat:
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
+ case Intrinsic::umul_fix:
+ case Intrinsic::umul_fix_sat:
case Intrinsic::pow:
case Intrinsic::powi:
case Intrinsic::sin:
@@ -7928,6 +7930,22 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) {
case Intrinsic::atan2:
case Intrinsic::canonicalize:
case Intrinsic::sqrt:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::exp10:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::modf:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::lrint:
+ case Intrinsic::llrint:
return true;
default:
return false;
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index ce813e1..520c6a0 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_cs_chain_preserve);
KEYWORD(amdgpu_kernel);
KEYWORD(amdgpu_gfx);
+ KEYWORD(amdgpu_gfx_whole_wave);
KEYWORD(tailcc);
KEYWORD(m68k_rtdcc);
KEYWORD(graalcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b7f6950..13bef1f 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
CC = CallingConv::AMDGPU_CS_ChainPreserve;
break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
+ case lltok::kw_amdgpu_gfx_whole_wave:
+ CC = CallingConv::AMDGPU_Gfx_WholeWave;
+ break;
case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break;
case lltok::kw_graalcc: CC = CallingConv::GRAAL; break;
@@ -4783,9 +4786,13 @@ struct MDField : public MDFieldImpl<Metadata *> {
};
struct MDStringField : public MDFieldImpl<MDString *> {
- bool AllowEmpty;
- MDStringField(bool AllowEmpty = true)
- : ImplTy(nullptr), AllowEmpty(AllowEmpty) {}
+ enum class EmptyIs {
+ Null, //< Allow empty input string, map to nullptr
+ Empty, //< Allow empty input string, map to an empty MDString
+ Error, //< Disallow empty string, map to an error
+ } EmptyIs;
+ MDStringField(enum EmptyIs EmptyIs = EmptyIs::Null)
+ : ImplTy(nullptr), EmptyIs(EmptyIs) {}
};
struct MDFieldList : public MDFieldImpl<SmallVector<Metadata *, 4>> {
@@ -5257,10 +5264,19 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, MDStringField &Result) {
if (parseStringConstant(S))
return true;
- if (!Result.AllowEmpty && S.empty())
- return error(ValueLoc, "'" + Name + "' cannot be empty");
+ if (S.empty()) {
+ switch (Result.EmptyIs) {
+ case MDStringField::EmptyIs::Null:
+ Result.assign(nullptr);
+ return false;
+ case MDStringField::EmptyIs::Empty:
+ break;
+ case MDStringField::EmptyIs::Error:
+ return error(ValueLoc, "'" + Name + "' cannot be empty");
+ }
+ }
- Result.assign(S.empty() ? nullptr : MDString::get(Context, S));
+ Result.assign(MDString::get(Context, S));
return false;
}
@@ -5778,7 +5794,7 @@ bool LLParser::parseDIFile(MDNode *&Result, bool IsDistinct) {
REQUIRED(directory, MDStringField, ); \
OPTIONAL(checksumkind, ChecksumKindField, (DIFile::CSK_MD5)); \
OPTIONAL(checksum, MDStringField, ); \
- OPTIONAL(source, MDStringField, );
+ OPTIONAL(source, MDStringField, (MDStringField::EmptyIs::Empty));
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -6062,7 +6078,7 @@ bool LLParser::parseDITemplateValueParameter(MDNode *&Result, bool IsDistinct) {
/// declaration: !4, align: 8)
bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
- OPTIONAL(name, MDStringField, (/* AllowEmpty */ false)); \
+ OPTIONAL(name, MDStringField, (MDStringField::EmptyIs::Error)); \
OPTIONAL(scope, MDField, ); \
OPTIONAL(linkageName, MDStringField, ); \
OPTIONAL(file, MDField, ); \
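
A sketch of the EmptyIs behaviors as seen through the textual parser; demoEmptyStringFields is illustrative and the diagnostic text is paraphrased from the error above.

  #include "llvm/AsmParser/Parser.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/SourceMgr.h"

  void demoEmptyStringFields() {
    llvm::LLVMContext Ctx;
    llvm::SMDiagnostic Err;
    // source: "" is now kept as an empty MDString (EmptyIs::Empty) rather
    // than being dropped to null:
    auto WithSource = llvm::parseAssemblyString(
        "!0 = !DIFile(filename: \"f.c\", directory: \"/\", source: \"\")",
        Err, Ctx);
    // name: "" on DIGlobalVariable remains an error (EmptyIs::Error); this
    // parse fails and Err reports that 'name' cannot be empty:
    auto BadName = llvm::parseAssemblyString(
        "!0 = distinct !DIGlobalVariable(name: \"\")", Err, Ctx);
    (void)WithSource; (void)BadName;
  }
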
diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt
index 38ba2d9..4b2debb 100644
--- a/llvm/lib/BinaryFormat/CMakeLists.txt
+++ b/llvm/lib/BinaryFormat/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMBinaryFormat
MsgPackDocumentYAML.cpp
MsgPackReader.cpp
MsgPackWriter.cpp
+ SFrame.cpp
Wasm.cpp
XCOFF.cpp
diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp
new file mode 100644
index 0000000..3b436af
--- /dev/null
+++ b/llvm/lib/BinaryFormat/SFrame.cpp
@@ -0,0 +1,37 @@
+//===-- SFrame.cpp ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+
+ArrayRef<EnumEntry<sframe::Version>> sframe::getVersions() {
+ static constexpr EnumEntry<Version> Versions[] = {
+#define HANDLE_SFRAME_VERSION(CODE, NAME) {#NAME, sframe::Version::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+
+ return ArrayRef(Versions);
+}
+
+ArrayRef<EnumEntry<sframe::Flags>> sframe::getFlags() {
+ static constexpr EnumEntry<sframe::Flags> Flags[] = {
+#define HANDLE_SFRAME_FLAG(CODE, NAME) {#NAME, sframe::Flags::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(Flags);
+}
+
+ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() {
+ static constexpr EnumEntry<sframe::ABI> ABIs[] = {
+#define HANDLE_SFRAME_ABI(CODE, NAME) {#NAME, sframe::ABI::NAME},
+#include "llvm/BinaryFormat/SFrameConstants.def"
+ };
+ return ArrayRef(ABIs);
+}
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 66ecc69..290d873 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -293,10 +293,18 @@ static Expected<bool> hasObjCCategoryInModule(BitstreamCursor &Stream) {
std::string S;
if (convertToString(Record, 0, S))
return error("Invalid section name record");
+
// Check for the i386 and other (x86_64, ARM) conventions
- if (S.find("__DATA,__objc_catlist") != std::string::npos ||
- S.find("__OBJC,__category") != std::string::npos ||
- S.find("__TEXT,__swift") != std::string::npos)
+
+ auto [Segment, Section] = StringRef(S).split(",");
+ Segment = Segment.trim();
+ Section = Section.trim();
+
+ if (Segment == "__DATA" && Section.starts_with("__objc_catlist"))
+ return true;
+ if (Segment == "__OBJC" && Section.starts_with("__category"))
+ return true;
+ if (Segment == "__TEXT" && Section.starts_with("__swift"))
return true;
break;
}
@@ -7007,13 +7015,6 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
if (StripDebugInfo)
stripDebugInfo(*F);
- // Upgrade any old intrinsic calls in the function.
- for (auto &I : UpgradedIntrinsics) {
- for (User *U : llvm::make_early_inc_range(I.first->materialized_users()))
- if (CallInst *CI = dyn_cast<CallInst>(U))
- UpgradeIntrinsicCall(CI, I.second);
- }
-
// Finish fn->subprogram upgrade for materialized functions.
if (DISubprogram *SP = MDLoader->lookupSubprogramForFunction(F))
F->setSubprogram(SP);
@@ -7029,7 +7030,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
}
}
- for (auto &I : instructions(F)) {
+ for (auto &I : make_early_inc_range(instructions(F))) {
// "Upgrade" older incorrect branch weights by dropping them.
if (auto *MD = I.getMetadata(LLVMContext::MD_prof)) {
if (MD->getOperand(0) != nullptr && isa<MDString>(MD->getOperand(0))) {
@@ -7060,8 +7061,8 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
}
}
- // Remove incompatible attributes on function calls.
if (auto *CI = dyn_cast<CallBase>(&I)) {
+ // Remove incompatible attributes on function calls.
CI->removeRetAttrs(AttributeFuncs::typeIncompatible(
CI->getFunctionType()->getReturnType(), CI->getRetAttributes()));
@@ -7069,6 +7070,13 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
CI->removeParamAttrs(ArgNo, AttributeFuncs::typeIncompatible(
CI->getArgOperand(ArgNo)->getType(),
CI->getParamAttributes(ArgNo)));
+
+ // Upgrade intrinsics.
+ if (Function *OldFn = CI->getCalledFunction()) {
+ auto It = UpgradedIntrinsics.find(OldFn);
+ if (It != UpgradedIntrinsics.end())
+ UpgradeIntrinsicCall(CI, It->second);
+ }
}
}
@@ -7116,9 +7124,11 @@ Error BitcodeReader::materializeModule() {
if (CallInst *CI = dyn_cast<CallInst>(U))
UpgradeIntrinsicCall(CI, I.second);
}
- if (!I.first->use_empty())
- I.first->replaceAllUsesWith(I.second);
- I.first->eraseFromParent();
+ if (I.first != I.second) {
+ if (!I.first->use_empty())
+ I.first->replaceAllUsesWith(I.second);
+ I.first->eraseFromParent();
+ }
}
UpgradedIntrinsics.clear();
diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index de6ebcf..51342c6 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -39,7 +39,7 @@ void ARMException::beginFunction(const MachineFunction *MF) {
if (CFISecType == AsmPrinter::CFISection::Debug) {
if (!hasEmittedCFISections) {
if (Asm->getModuleCFISectionType() == AsmPrinter::CFISection::Debug)
- Asm->OutStreamer->emitCFISections(false, true);
+ Asm->OutStreamer->emitCFISections(false, true, false);
hasEmittedCFISections = true;
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 76a1d8c..f1d3e96 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -809,7 +809,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
// If we have a bss global going to a section that supports the
// zerofill directive, do so here.
- if (GVKind.isBSS() && MAI->isMachO() && TheSection->isVirtualSection()) {
+ if (GVKind.isBSS() && MAI->isMachO() && TheSection->isBssSection()) {
if (Size == 0)
Size = 1; // zerofill of 0 bytes is undefined.
emitLinkage(GV, GVSym);
@@ -1868,6 +1868,7 @@ void AsmPrinter::emitFunctionBody() {
OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
break;
case TargetOpcode::EH_LABEL:
+ OutStreamer->AddComment("EH_LABEL");
OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
// For AsynchEH, insert a Nop if followed by a trap inst
// Or the exception won't be caught.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 4fac4bb..6b8d08c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -109,9 +109,11 @@ void DwarfCFIException::beginBasicBlockSection(const MachineBasicBlock &MBB) {
// chose not to be verbose in that case. And with `ForceDwarfFrameSection`,
// we should always emit .debug_frame.
if (CFISecType == AsmPrinter::CFISection::Debug ||
- Asm->TM.Options.ForceDwarfFrameSection)
+ Asm->TM.Options.ForceDwarfFrameSection ||
+ Asm->TM.Options.MCOptions.EmitSFrameUnwind)
Asm->OutStreamer->emitCFISections(
- CFISecType == AsmPrinter::CFISection::EH, true);
+ CFISecType == AsmPrinter::CFISection::EH, true,
+ Asm->TM.Options.MCOptions.EmitSFrameUnwind);
hasEmittedCFISections = true;
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 8e8cda4..5577a7d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1379,7 +1379,7 @@ void DwarfCompileUnit::constructCallSiteParmEntryDIEs(
DIE *DwarfCompileUnit::constructImportedEntityDIE(
const DIImportedEntity *Module) {
- DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag());
+ DIE *IMDie = DIE::get(DIEValueAllocator, Module->getTag());
insertDIE(Module, IMDie);
DIE *EntityDie;
auto *Entity = Module->getEntity();
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
index 618deef..4bf3bdf 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
@@ -18,6 +18,11 @@
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/MC/MCStreamer.h"
+#ifndef NDEBUG
+#include "llvm/IR/Module.h"
+#include "llvm/Support/WithColor.h"
+#endif
+
using namespace llvm;
void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
@@ -35,6 +40,9 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
uint64_t &CallerGuid = NameGuidMap[Name];
if (!CallerGuid)
CallerGuid = Function::getGUIDAssumingExternalLinkage(Name);
+#ifndef NDEBUG
+ verifyGuidExistenceInDesc(CallerGuid, Name);
+#endif
uint64_t CallerProbeId = PseudoProbeDwarfDiscriminator::extractProbeIndex(
InlinedAt->getDiscriminator());
ReversedInlineStack.emplace_back(CallerGuid, CallerProbeId);
@@ -51,4 +59,28 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack));
Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, Discriminator,
InlineStack, Asm->CurrentFnSym);
+#ifndef NDEBUG
+ verifyGuidExistenceInDesc(
+ Guid, DebugLoc ? DebugLoc->getSubprogramLinkageName() : "");
+#endif
+}
+
+#ifndef NDEBUG
+void PseudoProbeHandler::verifyGuidExistenceInDesc(uint64_t Guid,
+ StringRef FuncName) {
+ NamedMDNode *Desc = Asm->MF->getFunction().getParent()->getNamedMetadata(
+ PseudoProbeDescMetadataName);
+ assert(Desc && "pseudo probe desc metadata does not exist");
+
+ // Keep DescGuidSet up to date.
+ for (size_t I = DescGuidSet.size(), E = Desc->getNumOperands(); I != E; ++I) {
+ const auto *MD = cast<MDNode>(Desc->getOperand(I));
+ auto *ID = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ DescGuidSet.insert(ID->getZExtValue());
+ }
+
+ if (!DescGuidSet.contains(Guid))
+ WithColor::warning() << "Guid:" << Guid << " Name:" << FuncName
+ << " does not exist in pseudo probe desc\n";
}
+#endif
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
index f11b552..e950b23 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
@@ -15,6 +15,10 @@
#include "llvm/ADT/DenseMap.h"
+#ifndef NDEBUG
+#include "llvm/ADT/DenseSet.h"
+#endif
+
namespace llvm {
class AsmPrinter;
@@ -26,6 +30,13 @@ class PseudoProbeHandler {
// Name to GUID map, used as caching/memoization for speed.
DenseMap<StringRef, uint64_t> NameGuidMap;
+#ifndef NDEBUG
+ // All GUID in llvm.pseudo_probe_desc.
+ DenseSet<uint64_t> DescGuidSet;
+
+ void verifyGuidExistenceInDesc(uint64_t Guid, StringRef FuncName);
+#endif
+
public:
PseudoProbeHandler(AsmPrinter *A) : Asm(A) {};
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index dccd71f..13fd270 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -323,12 +323,6 @@ const MCExpr *WinException::getLabel(const MCSymbol *Label) {
Asm->OutContext);
}
-const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
- return MCBinaryExpr::createAdd(getLabel(Label),
- MCConstantExpr::create(1, Asm->OutContext),
- Asm->OutContext);
-}
-
const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf,
const MCSymbol *OffsetFrom) {
return MCBinaryExpr::createSub(
@@ -655,7 +649,7 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
AddComment("LabelStart");
OS.emitValue(getLabel(BeginLabel), 4);
AddComment("LabelEnd");
- OS.emitValue(getLabelPlusOne(EndLabel), 4);
+ OS.emitValue(getLabel(EndLabel), 4);
AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
: "CatchAll");
OS.emitValue(FilterOrFinally, 4);
@@ -950,13 +944,7 @@ void WinException::computeIP2StateTable(
if (!ChangeLabel)
ChangeLabel = StateChange.PreviousEndLabel;
// Emit an entry indicating that PCs after 'Label' have this EH state.
- // NOTE: On ARM architectures, the StateFromIp automatically takes into
- // account that the return address is after the call instruction (whose EH
- // state we should be using), but on other platforms we need to +1 to the
- // label so that we are using the correct EH state.
- const MCExpr *LabelExpression = (isAArch64 || isThumb)
- ? getLabel(ChangeLabel)
- : getLabelPlusOne(ChangeLabel);
+ const MCExpr *LabelExpression = getLabel(ChangeLabel);
IPToStateTable.push_back(
std::make_pair(LabelExpression, StateChange.NewState));
// FIXME: assert that NewState is between CatchLow and CatchHigh.
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.h b/llvm/lib/CodeGen/AsmPrinter/WinException.h
index 638589a..47dd30c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -80,7 +80,6 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
const MCExpr *create32bitRef(const MCSymbol *Value);
const MCExpr *create32bitRef(const GlobalValue *GV);
const MCExpr *getLabel(const MCSymbol *Label);
- const MCExpr *getLabelPlusOne(const MCSymbol *Label);
const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
const MCSymbol *OffsetFrom);
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index c3b4077..989cf4c4 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -45,7 +45,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeExpandPostRALegacyPass(Registry);
initializeFEntryInserterLegacyPass(Registry);
initializeFinalizeISelPass(Registry);
- initializeFinalizeMachineBundlesPass(Registry);
initializeFixupStatepointCallerSavedLegacyPass(Registry);
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index dc81843..c21058c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3571,9 +3571,7 @@ class TypePromotionTransaction {
}
// Record the debug uses separately. They are not in the instruction's
// use list, but they are replaced by RAUW.
- SmallVector<DbgValueInst *> DbgValues;
- findDbgValues(DbgValues, Inst, &DbgVariableRecords);
- assert(DbgValues.empty());
+ findDbgValues(Inst, DbgVariableRecords);
// Now, we can replace the uses.
Inst->replaceAllUsesWith(New);
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 714ec55..1c1047c 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -103,10 +103,10 @@ static void expandFPToI(Instruction *FPToI) {
Value *A1 = nullptr;
if (FloatVal->getType()->isHalfTy()) {
if (FPToI->getOpcode() == Instruction::FPToUI) {
- Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getIntNTy(32));
+ Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getInt32Ty());
A1 = Builder.CreateZExt(A0, IntTy);
} else { // FPToSI
- Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getIntNTy(32));
+ Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getInt32Ty());
A1 = Builder.CreateSExt(A0, IntTy);
}
FPToI->replaceAllUsesWith(A1);
@@ -425,8 +425,8 @@ static void expandIToFP(Instruction *IToFP) {
AAddr0->addIncoming(IsSigned ? Sub : IntVal, IfThen4);
AAddr0->addIncoming(Shl, SwBB);
Value *A0 = Builder.CreateTrunc(AAddr0, Builder.getInt32Ty());
- Value *A1 = Builder.CreateLShr(A0, Builder.getIntN(32, 2));
- Value *A2 = Builder.CreateAnd(A1, Builder.getIntN(32, 1));
+ Value *A1 = Builder.CreateLShr(A0, Builder.getInt32(2));
+ Value *A2 = Builder.CreateAnd(A1, Builder.getInt32(1));
Value *Conv16 = Builder.CreateZExt(A2, IntTy);
Value *Or17 = Builder.CreateOr(AAddr0, Conv16);
Value *Inc = Builder.CreateAdd(Or17, Builder.getIntN(BitWidth, 1));
@@ -457,9 +457,9 @@ static void expandIToFP(Instruction *IToFP) {
Value *Extract = Builder.CreateLShr(Shr21, Builder.getIntN(BitWidth, 32));
Value *ExtractT62 = nullptr;
if (FloatWidth > 80)
- ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getIntNTy(64));
+ ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getInt64Ty());
else
- ExtractT62 = Builder.CreateTrunc(Extract, Builder.getIntNTy(32));
+ ExtractT62 = Builder.CreateTrunc(Extract, Builder.getInt32Ty());
Builder.CreateBr(IfEnd26);
// if.else:
@@ -475,7 +475,7 @@ static void expandIToFP(Instruction *IToFP) {
Value *Extract65 = Builder.CreateLShr(Shl26, Builder.getIntN(BitWidth, 32));
Value *ExtractT66 = nullptr;
if (FloatWidth > 80)
- ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getIntNTy(64));
+ ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getInt64Ty());
else
ExtractT66 = Builder.CreateTrunc(Extract65, Builder.getInt32Ty());
Builder.CreateBr(IfEnd26);
@@ -507,30 +507,29 @@ static void expandIToFP(Instruction *IToFP) {
Builder.getIntN(BitWidth, 63));
And29 = Builder.CreateAnd(Shr, Temp2, "and29");
} else {
- Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getIntNTy(32));
+ Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getInt32Ty());
And29 = Builder.CreateAnd(
- Conv28, ConstantInt::getSigned(Builder.getIntNTy(32), 0x80000000));
+ Conv28, ConstantInt::getSigned(Builder.getInt32Ty(), 0x80000000));
}
unsigned TempMod = FPMantissaWidth % 32;
Value *And34 = nullptr;
Value *Shl30 = nullptr;
if (FloatWidth > 80) {
TempMod += 32;
- Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getIntN(64, TempMod));
+ Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getInt64(TempMod));
Shl30 = Builder.CreateAdd(
- Add,
- Builder.getIntN(64, ((1ull << (62ull - TempMod)) - 1ull) << TempMod));
- And34 = Builder.CreateZExt(Shl30, Builder.getIntNTy(128));
+ Add, Builder.getInt64(((1ull << (62ull - TempMod)) - 1ull) << TempMod));
+ And34 = Builder.CreateZExt(Shl30, Builder.getInt128Ty());
} else {
- Value *Add = Builder.CreateShl(E0, Builder.getIntN(32, TempMod));
+ Value *Add = Builder.CreateShl(E0, Builder.getInt32(TempMod));
Shl30 = Builder.CreateAdd(
- Add, Builder.getIntN(32, ((1 << (30 - TempMod)) - 1) << TempMod));
+ Add, Builder.getInt32(((1 << (30 - TempMod)) - 1) << TempMod));
And34 = Builder.CreateAnd(FloatWidth > 32 ? AAddr1Off32 : AAddr1Off0,
- Builder.getIntN(32, (1 << TempMod) - 1));
+ Builder.getInt32((1 << TempMod) - 1));
}
Value *Or35 = nullptr;
if (FloatWidth > 80) {
- Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getIntNTy(128));
+ Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getInt128Ty());
Value *Or31 = Builder.CreateOr(And29Trunc, And34);
Value *Or34 = Builder.CreateShl(Or31, Builder.getIntN(128, 64));
Value *Temp3 = Builder.CreateShl(Builder.getIntN(128, 1),
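The ExpandFp changes above are purely cosmetic: IRBuilder's fixed-width helpers are shorthand for the generic N-bit forms. A minimal sketch of the equivalence, assuming only an LLVMContext:

    #include "llvm/IR/IRBuilder.h"
    #include <cassert>
    using namespace llvm;

    void demo(LLVMContext &Ctx) {
      IRBuilder<> Builder(Ctx);
      // Types are uniqued per context, so both calls yield the same pointer.
      assert(Builder.getIntNTy(32) == Builder.getInt32Ty());
      // Likewise for constants: getIntN(32, 2) and getInt32(2) are identical.
      assert(Builder.getIntN(32, 2) == Builder.getInt32(2));
    }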
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 1286af8..974fc40 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1884,6 +1884,14 @@ unsigned GISelValueTracking::computeNumSignBits(Register R,
}
break;
}
+ case TargetOpcode::G_ASHR: {
+ Register Src1 = MI.getOperand(1).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
+ FirstAnswer = computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (auto C = getValidMinimumShiftAmount(Src2, DemandedElts, Depth + 1))
+ FirstAnswer = std::min<uint64_t>(FirstAnswer + *C, TyBits);
+ break;
+ }
case TargetOpcode::G_TRUNC: {
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
@@ -2053,6 +2061,64 @@ unsigned GISelValueTracking::computeNumSignBits(Register R, unsigned Depth) {
return computeNumSignBits(R, DemandedElts, Depth);
}
+std::optional<ConstantRange> GISelValueTracking::getValidShiftAmountRange(
+ Register R, const APInt &DemandedElts, unsigned Depth) {
+ // Shifting more than the bitwidth is not valid.
+ MachineInstr &MI = *MRI.getVRegDef(R);
+ unsigned Opcode = MI.getOpcode();
+
+ LLT Ty = MRI.getType(R);
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+
+ if (Opcode == TargetOpcode::G_CONSTANT) {
+ const APInt &ShAmt = MI.getOperand(1).getCImm()->getValue();
+ if (ShAmt.uge(BitWidth))
+ return std::nullopt;
+ return ConstantRange(ShAmt);
+ }
+
+ if (Opcode == TargetOpcode::G_BUILD_VECTOR) {
+ const APInt *MinAmt = nullptr, *MaxAmt = nullptr;
+ for (unsigned I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ MachineInstr *Op = MRI.getVRegDef(MI.getOperand(I + 1).getReg());
+ if (Op->getOpcode() != TargetOpcode::G_CONSTANT) {
+ MinAmt = MaxAmt = nullptr;
+ break;
+ }
+
+ const APInt &ShAmt = Op->getOperand(1).getCImm()->getValue();
+ if (ShAmt.uge(BitWidth))
+ return std::nullopt;
+ if (!MinAmt || MinAmt->ugt(ShAmt))
+ MinAmt = &ShAmt;
+ if (!MaxAmt || MaxAmt->ult(ShAmt))
+ MaxAmt = &ShAmt;
+ }
+ assert(((!MinAmt && !MaxAmt) || (MinAmt && MaxAmt)) &&
+ "Failed to find matching min/max shift amounts");
+ if (MinAmt && MaxAmt)
+ return ConstantRange(*MinAmt, *MaxAmt + 1);
+ }
+
+  // Use computeKnownBits to find a hidden constant/knownbits (usually type
+  // legalized), e.g. hidden behind multiple bitcasts/build_vector/casts etc.
+ KnownBits KnownAmt = getKnownBits(R, DemandedElts, Depth);
+ if (KnownAmt.getMaxValue().ult(BitWidth))
+ return ConstantRange::fromKnownBits(KnownAmt, /*IsSigned=*/false);
+
+ return std::nullopt;
+}
+
+std::optional<uint64_t> GISelValueTracking::getValidMinimumShiftAmount(
+ Register R, const APInt &DemandedElts, unsigned Depth) {
+ if (std::optional<ConstantRange> AmtRange =
+ getValidShiftAmountRange(R, DemandedElts, Depth))
+ return AmtRange->getUnsignedMin().getZExtValue();
+ return std::nullopt;
+}
+
void GISelValueTrackingAnalysisLegacy::getAnalysisUsage(
AnalysisUsage &AU) const {
AU.setPreservesAll();
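The new G_ASHR case mirrors the long-standing SelectionDAG rule: an arithmetic right shift by a known-minimum amount C gains C sign bits, clamped to the type width. A worked example with made-up numbers (not taken from any test):

    #include <algorithm>
    #include <cstdint>

    int main() {
      // An i8 source already known to have 3 sign bits, shifted right
      // (arithmetically) by at least 2, has min(3 + 2, 8) = 5 sign bits.
      uint64_t TyBits = 8, SrcSignBits = 3, MinShAmt = 2;
      uint64_t Known = std::min<uint64_t>(SrcSignBits + MinShAmt, TyBits);
      return Known == 5 ? 0 : 1;
    }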
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index d7280ea..dc5dfab 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2189,23 +2189,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START
: TargetOpcode::LIFETIME_END;
- // Get the underlying objects for the location passed on the lifetime
- // marker.
- SmallVector<const Value *, 4> Allocas;
- getUnderlyingObjects(CI.getArgOperand(1), Allocas);
-
- // Iterate over each underlying object, creating lifetime markers for each
- // static alloca. Quit if we find a non-static alloca.
- for (const Value *V : Allocas) {
- const AllocaInst *AI = dyn_cast<AllocaInst>(V);
- if (!AI)
- continue;
-
- if (!AI->isStaticAlloca())
- return true;
+ const AllocaInst *AI = cast<AllocaInst>(CI.getArgOperand(1));
+ if (!AI->isStaticAlloca())
+ return true;
- MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI));
- }
+ MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI));
return true;
}
case Intrinsic::fake_use: {
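The cast<AllocaInst> in this simplification (and in the matching SelectionDAGBuilder change further down) encodes the assumption that a lifetime marker's pointer argument is now the alloca itself rather than something derived from it, so the getUnderlyingObjects walk is unnecessary. A sketch of the accepted IR shape under that assumption:

    declare void @llvm.lifetime.start.p0(i64, ptr)

    define void @f() {
      %buf = alloca [16 x i8]
      ; The pointer operand is the alloca directly, not a GEP or bitcast of it.
      call void @llvm.lifetime.start.p0(i64 16, ptr %buf)
      ret void
    }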
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 11b3ac8..ed7b07f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -10120,14 +10120,10 @@ LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
return Legalized;
}
- bool IsVolatile = MemOp->isVolatile();
- // Don't try to optimize volatile.
- if (IsVolatile)
- return UnableToLegalize;
-
if (MaxLen && KnownLen > MaxLen)
return UnableToLegalize;
+ bool IsVolatile = MemOp->isVolatile();
if (Opc == TargetOpcode::G_MEMCPY) {
auto &MF = *MI.getParent()->getParent();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index d2b2edf..c2839d4 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -253,6 +253,21 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return false;
}
+static Value *getMaskOperand(IntrinsicInst *II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic");
+ case Intrinsic::vp_load:
+ return II->getOperand(1);
+ case Intrinsic::masked_load:
+ return II->getOperand(2);
+ case Intrinsic::vp_store:
+ return II->getOperand(2);
+ case Intrinsic::masked_store:
+ return II->getOperand(3);
+ }
+}
+
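The operand indices in getMaskOperand follow the intrinsic signatures. A short IR sketch of where the mask sits in each call:

    ; vp.load(ptr, mask, evl)              - mask is operand 1
    ; masked.load(ptr, align, mask, pt)    - mask is operand 2
    ; vp.store(val, ptr, mask, evl)        - mask is operand 2
    ; masked.store(val, ptr, align, mask)  - mask is operand 3
    %v = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> %m, i32 %evl)
    %w = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 4, <8 x i1> %m, <8 x i32> %pt)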
// Return the corresponding deinterleaved mask, or nullptr if there is no valid
// mask.
static Value *getMask(Value *WideMask, unsigned Factor,
@@ -268,17 +283,13 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
if (isa<ScalableVectorType>(Load->getType()))
return false;
- if (auto *LI = dyn_cast<LoadInst>(Load)) {
- if (!LI->isSimple())
- return false;
- } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
- assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
- // Require a constant mask.
- if (!isa<ConstantVector>(VPLoad->getMaskParam()))
- return false;
- } else {
- llvm_unreachable("unsupported load operation");
- }
+ auto *LI = dyn_cast<LoadInst>(Load);
+ auto *II = dyn_cast<IntrinsicInst>(Load);
+ if (!LI && !II)
+ return false;
+
+ if (LI && !LI->isSimple())
+ return false;
// Check if all users of this load are shufflevectors. If we encounter any
// users that are extractelement instructions or binary operators, we save
@@ -330,7 +341,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
- Type *VecTy = FirstSVI->getType();
+ VectorType *VecTy = cast<VectorType>(FirstSVI->getType());
// Check if other shufflevectors are also DE-interleaved of the same type
// and factor as the first shufflevector.
@@ -368,13 +379,16 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
Value *Mask = nullptr;
- if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
- Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
+ if (LI) {
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+ } else {
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Mask = getMask(getMaskOperand(II), Factor, VecTy);
if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
- } else {
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
+ << *Load << "\n");
}
// Try to create target specific intrinsics to replace the load and
@@ -491,18 +505,16 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
bool InterleavedAccessImpl::lowerInterleavedStore(
Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
Value *StoredValue;
- if (auto *SI = dyn_cast<StoreInst>(Store)) {
+ auto *SI = dyn_cast<StoreInst>(Store);
+ auto *II = dyn_cast<IntrinsicInst>(Store);
+ if (SI) {
if (!SI->isSimple())
return false;
StoredValue = SI->getValueOperand();
- } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
- assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
- // Require a constant mask.
- if (!isa<ConstantVector>(VPStore->getMaskParam()))
- return false;
- StoredValue = VPStore->getArgOperand(0);
} else {
- llvm_unreachable("unsupported store operation");
+ assert(II->getIntrinsicID() == Intrinsic::vp_store ||
+ II->getIntrinsicID() == Intrinsic::masked_store);
+ StoredValue = II->getArgOperand(0);
}
auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
@@ -518,46 +530,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");
- if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
+ Value *Mask = nullptr;
+ if (SI) {
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
+ } else {
+ // Check mask operand. Handle both all-true/false and interleaved mask.
unsigned LaneMaskLen = NumStoredElements / Factor;
- Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
- ElementCount::getFixed(LaneMaskLen));
- if (!LaneMask)
+ Mask = getMask(getMaskOperand(II), Factor,
+ ElementCount::getFixed(LaneMaskLen));
+ if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
- << "\n");
-
- IRBuilder<> Builder(VPStore);
- // We need to effectively de-interleave the shufflemask
- // because lowerInterleavedVPStore expects individual de-interleaved
- // values.
- SmallVector<Value *, 10> NewShuffles;
- SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
- auto ShuffleMask = SVI->getShuffleMask();
-
- for (unsigned i = 0; i < Factor; i++) {
- for (unsigned j = 0; j < LaneMaskLen; j++)
- NewShuffleMask[j] = ShuffleMask[i + Factor * j];
-
- NewShuffles.push_back(Builder.CreateShuffleVector(
- SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
- }
-
- // Try to create target specific intrinsics to replace the vp.store and
- // shuffle.
- if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
- // We already created new shuffles.
- return true;
- } else {
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
-
- // Try to create target specific intrinsics to replace the store and
- // shuffle.
- if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
- return false;
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
+ << *Store << "\n");
}
+ // Try to create target specific intrinsics to replace the store and
+ // shuffle.
+ if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+ return false;
+
// Already have a new target specific interleaved store. Erase the old store.
DeadInsts.insert(Store);
DeadInsts.insert(SVI);
@@ -595,92 +587,116 @@ static Value *getMask(Value *WideMask, unsigned Factor,
}
}
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
+  // Check that the shuffle mask is: a) an interleave, b) drawn entirely from
+  // the same set of elements, and c) contained in the first source. (c) could
+  // be relaxed if desired.
+ unsigned NumSrcElts =
+ cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements();
+ SmallVector<unsigned> StartIndexes;
+ if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor,
+ NumSrcElts * 2, StartIndexes) &&
+ llvm::all_of(StartIndexes, [](unsigned Start) { return Start == 0; }) &&
+ llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) {
+ return Idx < (int)NumSrcElts;
+ })) {
+ auto *LeafMaskTy =
+ VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
+ IRBuilder<> Builder(SVI);
+ return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
+ uint64_t(0));
+ }
+ }
+
return nullptr;
}
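The new shufflevector case recognizes a wide mask built by interleaving a narrow mask with itself. A factor-2 sketch (names invented for illustration): each lane of %m is repeated twice, and getMask recovers %m by extracting the leading subvector from the shuffle's first source.

    %wide = shufflevector <4 x i1> %m, <4 x i1> poison,
            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>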
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- Value *LoadedVal = DI->getOperand(0);
- if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
+ Instruction *LoadedVal = dyn_cast<Instruction>(DI->getOperand(0));
+ if (!LoadedVal || !LoadedVal->hasOneUse())
+ return false;
+
+ auto *LI = dyn_cast<LoadInst>(LoadedVal);
+ auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
+ if (!LI && !II)
return false;
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
assert(Factor && "unexpected deinterleave intrinsic");
Value *Mask = nullptr;
- if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
- if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
- return false;
- // Check mask operand. Handle both all-true/false and interleaved mask.
- Value *WideMask = VPLoad->getOperand(1);
- Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
- if (!Mask)
- return false;
-
- LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
- << *DI << " and factor = " << Factor << "\n");
- } else {
- auto *LI = cast<LoadInst>(LoadedVal);
+ if (LI) {
if (!LI->isSimple())
return false;
LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
<< " and factor = " << Factor << "\n");
+ } else {
+ assert(II);
+
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
+ if (!Mask)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
+ << " intrinsic " << *DI << " and factor = "
+ << Factor << "\n");
}
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(cast<Instruction>(LoadedVal), Mask,
- DI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
return false;
DeadInsts.insert(DI);
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(cast<Instruction>(LoadedVal));
+ DeadInsts.insert(LoadedVal);
return true;
}
bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
- IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!II->hasOneUse())
+ IntrinsicInst *IntII, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ if (!IntII->hasOneUse())
return false;
- Value *StoredBy = II->user_back();
- if (!isa<StoreInst, VPIntrinsic>(StoredBy))
+ Instruction *StoredBy = dyn_cast<Instruction>(IntII->user_back());
+ if (!StoredBy)
+ return false;
+ auto *SI = dyn_cast<StoreInst>(StoredBy);
+ auto *II = dyn_cast<IntrinsicInst>(StoredBy);
+ if (!SI && !II)
return false;
- SmallVector<Value *, 8> InterleaveValues(II->args());
- const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID());
+ SmallVector<Value *, 8> InterleaveValues(IntII->args());
+ const unsigned Factor = getInterleaveIntrinsicFactor(IntII->getIntrinsicID());
assert(Factor && "unexpected interleave intrinsic");
Value *Mask = nullptr;
- if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) {
- if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
- return false;
-
- Value *WideMask = VPStore->getOperand(2);
- Mask = getMask(WideMask, Factor,
+ if (II) {
+ // Check mask operand. Handle both all-true/false and interleaved mask.
+ Mask = getMask(getMaskOperand(II), Factor,
cast<VectorType>(InterleaveValues[0]->getType()));
if (!Mask)
return false;
- LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic "
- << *II << " and factor = " << Factor << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
+ << " intrinsic " << *IntII << " and factor = "
+ << Factor << "\n");
} else {
- auto *SI = cast<StoreInst>(StoredBy);
if (!SI->isSimple())
return false;
- LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II
- << " and factor = " << Factor << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic "
+ << *IntII << " and factor = " << Factor << "\n");
}
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(cast<Instruction>(StoredBy), Mask,
- InterleaveValues))
+ if (!TLI->lowerInterleaveIntrinsicToStore(StoredBy, Mask, InterleaveValues))
return false;
// We now have a target-specific store, so delete the old one.
- DeadInsts.insert(cast<Instruction>(StoredBy));
- DeadInsts.insert(II);
+ DeadInsts.insert(StoredBy);
+ DeadInsts.insert(IntII);
return true;
}
@@ -692,11 +708,13 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
using namespace PatternMatch;
for (auto &I : instructions(F)) {
if (match(&I, m_CombineOr(m_Load(m_Value()),
- m_Intrinsic<Intrinsic::vp_load>())))
+ m_Intrinsic<Intrinsic::vp_load>())) ||
+ match(&I, m_Intrinsic<Intrinsic::masked_load>()))
Changed |= lowerInterleavedLoad(&I, DeadInsts);
if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
- m_Intrinsic<Intrinsic::vp_store>())))
+ m_Intrinsic<Intrinsic::vp_store>())) ||
+ match(&I, m_Intrinsic<Intrinsic::masked_store>()))
Changed |= lowerInterleavedStore(&I, DeadInsts);
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 7710b50..bc4e299 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -815,6 +815,9 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
if (MI.getFlag(MachineInstr::SameSign))
OS << "samesign ";
+ // NOTE: Please add new MIFlags also to the MI_FLAGS_STR in
+ // llvm/utils/update_mir_test_checks.py.
+
OS << TII->getName(MI.getOpcode());
LS = ListSeparator();
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 34896c6..4da0184 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -83,27 +83,6 @@ llvm::createUnpackMachineBundles(
return new UnpackMachineBundles(std::move(Ftor));
}
-namespace {
- class FinalizeMachineBundles : public MachineFunctionPass {
- public:
- static char ID; // Pass identification
- FinalizeMachineBundles() : MachineFunctionPass(ID) {
- initializeFinalizeMachineBundlesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
- };
-} // end anonymous namespace
-
-char FinalizeMachineBundles::ID = 0;
-char &llvm::FinalizeMachineBundlesID = FinalizeMachineBundles::ID;
-INITIALIZE_PASS(FinalizeMachineBundles, "finalize-mi-bundles",
- "Finalize machine instruction bundles", false, false)
-
-bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) {
- return llvm::finalizeBundles(MF);
-}
-
/// Return the first found DebugLoc that has a DILocation, given a range of
/// instructions. The search range is from FirstMI to LastMI (exclusive). If no
/// DILocation is found, then an empty location is returned.
@@ -359,3 +338,13 @@ PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg,
return PRI;
}
+
+PreservedAnalyses
+llvm::FinalizeBundleTestPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &) {
+ // For testing purposes, bundle the entire contents of each basic block
+ // except for terminators.
+ for (MachineBasicBlock &MBB : MF)
+ finalizeBundle(MBB, MBB.instr_begin(), MBB.getFirstInstrTerminator());
+ return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index e144111..286fbfd 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -49,7 +49,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
#include <limits>
#include <vector>
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b38a4d1c..90005bd 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -4279,8 +4279,8 @@ void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
!TII->isGlobalMemoryObject(FromMI) &&
!TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
SDep Pred = Dep;
- Pred.setSUnit(Src);
- Dst->addPred(Pred);
+ Pred.setSUnit(From);
+ To->addPred(Pred);
}
}
}
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 76cba29..9d5c39c 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -771,24 +771,6 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI,
MI->isFakeUse();
}
-/// A region of an MBB for scheduling.
-namespace {
-struct SchedRegion {
- /// RegionBegin is the first instruction in the scheduling region, and
- /// RegionEnd is either MBB->end() or the scheduling boundary after the
- /// last instruction in the scheduling region. These iterators cannot refer
- /// to instructions outside of the identified scheduling region because
- /// those may be reordered before scheduling this region.
- MachineBasicBlock::iterator RegionBegin;
- MachineBasicBlock::iterator RegionEnd;
- unsigned NumRegionInstrs;
-
- SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
- unsigned N) :
- RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
-};
-} // end anonymous namespace
-
using MBBRegionsVector = SmallVector<SchedRegion, 16>;
static void
@@ -3725,7 +3707,8 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
RegionPolicy.OnlyBottomUp = true;
// Allow the subtarget to override default policy.
- MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
+ SchedRegion Region(Begin, End, NumRegionInstrs);
+ MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Region);
// After subtarget overrides, apply command line options.
if (!EnableRegPressure) {
@@ -4338,7 +4321,8 @@ void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
RegionPolicy.OnlyBottomUp = false;
// Allow the subtarget to override default policy.
- MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, NumRegionInstrs);
+ SchedRegion Region(Begin, End, NumRegionInstrs);
+ MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, Region);
// After subtarget overrides, apply command line options.
if (PostRADirection == MISched::TopDown) {
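Both scheduler hooks now hand the subtarget the whole SchedRegion (begin/end iterators plus the instruction count) instead of a bare count. A hypothetical target override, with the signature as suggested by the call sites above (MySubtarget and the policy choice are invented):

    void MySubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                          const SchedRegion &Region) const {
      // The region's instructions are now inspectable, not just countable.
      if (Region.NumRegionInstrs > 128)
        Policy.ShouldTrackPressure = true;
    }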
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 9962070..908ed96 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -614,6 +614,13 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
Use &U = *AI->use_begin();
Instruction *User = cast<Instruction>(U.getUser());
+ // Drop lifetime markers now that this is no longer an alloca.
+ // SafeStack has already performed its own stack coloring.
+ if (User->isLifetimeStartOrEnd()) {
+ User->eraseFromParent();
+ continue;
+ }
+
Instruction *InsertBefore;
if (auto *PHI = dyn_cast<PHINode>(User))
InsertBefore = PHI->getIncomingBlock(U)->getTerminator();
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fed5e72..d3df434 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12375,11 +12375,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
// Any flags available in a select/setcc fold will be on the setcc as they
// migrated from fcmp
- Flags = N0->getFlags();
- SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
- N2, N0.getOperand(2));
- SelectNode->setFlags(Flags);
- return SelectNode;
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2,
+ N0.getOperand(2), N0->getFlags());
}
if (SDValue ABD = foldSelectToABD(Cond0, Cond1, N1, N2, CC, DL))
@@ -16738,7 +16735,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
- // conditions 1) one-use, and 2) does not produce poison then push
+ // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
+ // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
// the freeze through to the operands that are not guaranteed non-poison.
// NOTE: we will strip poison-generating flags, so ignore them here.
if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16746,6 +16744,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0->getNumValues() != 1 || !N0->hasOneUse())
return SDValue();
+  // TODO: we should always allow multiple operands; however, this increases
+  // the likelihood of infinite loops: the ReplaceAllUsesOfValueWith call below
+  // can cause later nodes that share frozen operands to fold again, and then
+  // fail to confirm that other operands are not poison due to the recursion
+  // depth limits on isGuaranteedNotToBeUndefOrPoison.
+ bool AllowMultipleMaybePoisonOperands =
+ N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
+ N0.getOpcode() == ISD::BUILD_VECTOR ||
+ N0.getOpcode() == ISD::BUILD_PAIR ||
+ N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
+
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
// ones" or "constant" into something that depends on FrozenUndef. We can
// instead pick undef values to keep those properties, while at the same time
@@ -16772,8 +16782,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
/*Depth*/ 1))
continue;
- if (MaybePoisonOperands.insert(Op).second)
+ bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
+ bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
+ if (IsNewMaybePoisonOperand)
MaybePoisonOperandNumbers.push_back(OpNo);
+ if (!HadMaybePoisonOperands)
+ continue;
+ if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
+ // Multiple maybe-poison ops when not allowed - bail out.
+ return SDValue();
+ }
}
// NOTE: the whole op may still not be guaranteed to be undef- or poison-free,
// because it could create undef or poison due to its poison-generating flags.
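An IR-level analogue of what AllowMultipleMaybePoisonOperands permits for shuffles (the combine itself operates on DAG nodes): both inputs may be poison, and both get frozen rather than bailing out.

    %fx = freeze <4 x i32> %x
    %fy = freeze <4 x i32> %y
    %s  = shufflevector <4 x i32> %fx, <4 x i32> %fy,
          <4 x i32> <i32 0, i32 5, i32 2, i32 7>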
@@ -22727,11 +22745,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
- if (!LifetimeEnd->hasOffset())
- return SDValue();
-
- const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
- LifetimeEnd->getOffset(), false);
+ const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(), 0, false);
// We walk up the chains to find stores.
SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
@@ -29418,9 +29432,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
return {false /*isVolatile*/,
/*isAtomic*/ false,
LN->getOperand(1),
- (LN->hasOffset()) ? LN->getOffset() : 0,
- (LN->hasOffset()) ? LocationSize::precise(LN->getSize())
- : LocationSize::beforeOrAfterPointer(),
+ 0,
+ LocationSize::precise(LN->getSize()),
(MachineMemOperand *)nullptr};
// Default.
return {false /*isvolatile*/,
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 85efb1b..8c8daef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -402,7 +402,12 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, SDValue Op,
AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
IsDebug, IsClone, IsCloned);
} else if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- MIB.addImm(C->getSExtValue());
+ if (C->getAPIntValue().getSignificantBits() <= 64) {
+ MIB.addImm(C->getSExtValue());
+ } else {
+ MIB.addCImm(
+ ConstantInt::get(MF->getFunction().getContext(), C->getAPIntValue()));
+ }
} else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
MIB.addFPImm(F->getConstantFPValue());
} else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
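Previously this path called getSExtValue() unconditionally, which asserts for immediates wider than 64 bits. A worked example of a value that now takes the addCImm path (the value is made up):

    APInt Val(128, 0);
    Val.setBit(63);                          // 2^63 does not fit in int64_t
    assert(Val.getSignificantBits() == 65);  // > 64, so addCImm is used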
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7266940..74172b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2785,19 +2785,17 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
// In strict mode, we must avoid spurious exceptions, and therefore
// must make sure to only emit a single STRICT_SINT_TO_FP.
SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Op0);
- Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other },
- { Node->getOperand(0), InCvt });
- Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other },
- { Fast.getValue(1), Fast, Fast });
- Chain = Slow.getValue(1);
// The STRICT_SINT_TO_FP inherits the exception mode from the
// incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can
// never raise any exception.
SDNodeFlags Flags;
Flags.setNoFPExcept(Node->getFlags().hasNoFPExcept());
- Fast->setFlags(Flags);
+ Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DestVT, MVT::Other},
+ {Node->getOperand(0), InCvt}, Flags);
Flags.setNoFPExcept(true);
- Slow->setFlags(Flags);
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, {DestVT, MVT::Other},
+ {Fast.getValue(1), Fast, Fast}, Flags);
+ Chain = Slow.getValue(1);
} else {
SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
@@ -3407,14 +3405,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT VT = Operand.getValueType();
SDValue One = DAG.getConstantFP(1.0, dl, VT);
SDValue Chain = DAG.getEntryNode();
- SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
- {Chain, Operand, One});
-
// Propagate existing flags on canonicalize, and additionally set
// NoFPExcept.
SDNodeFlags CanonicalizeFlags = Node->getFlags();
CanonicalizeFlags.setNoFPExcept(true);
- Mul->setFlags(CanonicalizeFlags);
+ SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
+ {Chain, Operand, One}, CanonicalizeFlags);
Results.push_back(Mul);
break;
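Supplying the flags to getNode, instead of calling setFlags on the result, matters when getNode CSE-hits an existing node: getNode intersects the requested flags with that node's flags, whereas a later setFlags would overwrite flags seen by the node's other users. The same reasoning applies to the getSelectCC/SELECT_CC conversions below. A sketch of the preferred shape:

    SDNodeFlags Flags = Node->getFlags();
    Flags.setNoFPExcept(true);
    // Flags participate in CSE: a pre-existing node keeps the intersection.
    SDValue V = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
                            {Chain, Operand, One}, Flags);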
@@ -4150,15 +4146,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp2 = Node->getOperand(1);
Tmp3 = Node->getOperand(2);
if (Tmp1.getOpcode() == ISD::SETCC) {
- Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
- Tmp2, Tmp3,
- cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+ Tmp1 = DAG.getSelectCC(
+ dl, Tmp1.getOperand(0), Tmp1.getOperand(1), Tmp2, Tmp3,
+ cast<CondCodeSDNode>(Tmp1.getOperand(2))->get(), Node->getFlags());
} else {
- Tmp1 = DAG.getSelectCC(dl, Tmp1,
- DAG.getConstant(0, dl, Tmp1.getValueType()),
- Tmp2, Tmp3, ISD::SETNE);
+ Tmp1 =
+ DAG.getSelectCC(dl, Tmp1, DAG.getConstant(0, dl, Tmp1.getValueType()),
+ Tmp2, Tmp3, ISD::SETNE, Node->getFlags());
}
- Tmp1->setFlags(Node->getFlags());
Results.push_back(Tmp1);
break;
case ISD::BR_JT: {
@@ -4296,8 +4291,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT Tmp1VT = Tmp1.getValueType();
Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
DAG.getBoolConstant(true, dl, VT, Tmp1VT),
- DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3);
- Tmp1->setFlags(Node->getFlags());
+ DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3,
+ Node->getFlags());
Results.push_back(Tmp1);
break;
}
@@ -4335,8 +4330,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (TLI.isCondCodeLegalOrCustom(InvCC, Tmp1.getSimpleValueType())) {
// Use the new condition code and swap true and false
Legalized = true;
- Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
- Tmp1->setFlags(Node->getFlags());
+ Tmp1 =
+ DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC, Node->getFlags());
} else {
// If The inverse is not legal, then try to swap the arguments using
// the inverse condition code.
@@ -4345,8 +4340,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// The swapped inverse condition is legal, so swap true and false,
// lhs and rhs.
Legalized = true;
- Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
- Tmp1->setFlags(Node->getFlags());
+ Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC,
+ Node->getFlags());
}
}
@@ -4365,15 +4360,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
// condition code, create a new SELECT_CC node.
if (CC.getNode()) {
- Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
- Tmp1, Tmp2, Tmp3, Tmp4, CC);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp2, Tmp3, Tmp4, CC, Node->getFlags());
} else {
Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType());
CC = DAG.getCondCode(ISD::SETNE);
Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
- Tmp2, Tmp3, Tmp4, CC);
+ Tmp2, Tmp3, Tmp4, CC, Node->getFlags());
}
- Tmp1->setFlags(Node->getFlags());
}
Results.push_back(Tmp1);
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index f908a66..d2ecc133 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -2087,11 +2087,10 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node,
// Otherwise, SETCC for the given comparison type must be completely
// illegal; expand it into a SELECT_CC.
EVT VT = Node->getValueType(0);
- LHS =
- DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS,
- DAG.getBoolConstant(true, dl, VT, LHS.getValueType()),
- DAG.getBoolConstant(false, dl, VT, LHS.getValueType()), CC);
- LHS->setFlags(Node->getFlags());
+ LHS = DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS,
+ DAG.getBoolConstant(true, dl, VT, LHS.getValueType()),
+ DAG.getBoolConstant(false, dl, VT, LHS.getValueType()),
+ CC, Node->getFlags());
}
Results.push_back(LHS);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 32c5961..1661814 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -372,9 +372,9 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
SDVTList ScalarVTs = DAG.getVTList(
ResVT.getVectorElementType(), OvVT.getVectorElementType());
- SDNode *ScalarNode = DAG.getNode(
- N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
- ScalarNode->setFlags(N->getFlags());
+ SDNode *ScalarNode = DAG.getNode(N->getOpcode(), DL, ScalarVTs,
+ {ScalarLHS, ScalarRHS}, N->getFlags())
+ .getNode();
// Replace the other vector result not being explicitly scalarized here.
unsigned OtherNo = 1 - ResNo;
@@ -1898,7 +1898,7 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
NE = ResNE;
//The results of each unrolled operation, including the chain.
- EVT ChainVTs[] = {EltVT, MVT::Other};
+ SDVTList ChainVTs = DAG.getVTList(EltVT, MVT::Other);
SmallVector<SDValue, 8> Chains;
unsigned i;
@@ -1914,8 +1914,8 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
Operands[j] = Operand;
}
}
- SDValue Scalar = DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands);
- Scalar.getNode()->setFlags(N->getFlags());
+ SDValue Scalar =
+ DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands, N->getFlags());
//Add in the scalar as well as its chain value to the
//result vectors.
@@ -1956,10 +1956,10 @@ void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
unsigned Opcode = N->getOpcode();
SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
- SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
- SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
- LoNode->setFlags(N->getFlags());
- HiNode->setFlags(N->getFlags());
+ SDNode *LoNode =
+ DAG.getNode(Opcode, dl, LoVTs, {LoLHS, LoRHS}, N->getFlags()).getNode();
+ SDNode *HiNode =
+ DAG.getNode(Opcode, dl, HiVTs, {HiLHS, HiRHS}, N->getFlags()).getNode();
Lo = SDValue(LoNode, ResNo);
Hi = SDValue(HiNode, ResNo);
@@ -2669,10 +2669,8 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOpWithTwoResults(SDNode *N,
else
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo);
- Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi);
- Lo->setFlags(N->getFlags());
- Hi->setFlags(N->getFlags());
+ Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo, N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi, N->getFlags());
SDNode *HiNode = Hi.getNode();
SDNode *LoNode = Lo.getNode();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2458115..773ff48 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -786,10 +786,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
break;
case ISD::LIFETIME_START:
case ISD::LIFETIME_END:
- if (cast<LifetimeSDNode>(N)->hasOffset()) {
- ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
- ID.AddInteger(cast<LifetimeSDNode>(N)->getOffset());
- }
+ ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
break;
case ISD::PSEUDO_PROBE:
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
@@ -3036,7 +3033,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
return TLI->isSplatValueForTargetNode(V, DemandedElts, UndefElts, *this,
Depth);
break;
-}
+ }
// We don't support other cases than those above for scalable vectors at
// the moment.
@@ -9364,7 +9361,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
SDValue Chain, int FrameIndex,
- int64_t Size, int64_t Offset) {
+ int64_t Size) {
const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
const auto VTs = getVTList(MVT::Other);
SDValue Ops[2] = {
@@ -9377,13 +9374,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
AddNodeIDNode(ID, Opcode, VTs, Ops);
ID.AddInteger(FrameIndex);
ID.AddInteger(Size);
- ID.AddInteger(Offset);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- LifetimeSDNode *N = newSDNode<LifetimeSDNode>(
- Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, Size, Offset);
+ LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(),
+ dl.getDebugLoc(), VTs, Size);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
@@ -10563,7 +10559,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDUse> Ops) {
switch (Ops.size()) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
+ case 1: return getNode(Opcode, DL, VT, Ops[0].get());
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
@@ -10699,7 +10695,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
- return getNode(Opcode, DL, getVTList(ResultTys), Ops);
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, getVTList(ResultTys), Ops, Flags);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops,
+ const SDNodeFlags Flags) {
+ return getNode(Opcode, DL, getVTList(ResultTys), Ops, Flags);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
@@ -10855,26 +10860,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
(Ops[2]->getAsZExtVal() == 0 || Ops[2]->getAsZExtVal() == 1) &&
"Invalid STRICT_FP_ROUND!");
break;
-#if 0
- // FIXME: figure out how to safely handle things like
- // int foo(int x) { return 1 << (x & 255); }
- // int bar() { return foo(256); }
- case ISD::SRA_PARTS:
- case ISD::SRL_PARTS:
- case ISD::SHL_PARTS:
- if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
- return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
- else if (N3.getOpcode() == ISD::AND)
- if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
- // If the and is only masking out bits that cannot effect the shift,
- // eliminate the and.
- unsigned NumBits = VT.getScalarSizeInBits()*2;
- if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
- return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
- }
- break;
-#endif
}
// Memoize the node unless it returns a glue result.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index da92aaa..8f08046 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -303,10 +303,7 @@ BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
return matchLSNode(LS0, DAG);
if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
- if (LN->hasOffset())
- return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
- false);
- return BaseIndexOffset(LN->getOperand(1), SDValue(), false);
+ return BaseIndexOffset(LN->getOperand(1), SDValue(), 0, false);
}
return BaseIndexOffset();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 01e5312..1636465 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7596,32 +7596,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
const int64_t ObjectSize =
cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
- Value *const ObjectPtr = I.getArgOperand(1);
- SmallVector<const Value *, 4> Allocas;
- getUnderlyingObjects(ObjectPtr, Allocas);
+ const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
- for (const Value *Alloca : Allocas) {
- const AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(Alloca);
-
- // Could not find an Alloca.
- if (!LifetimeObject)
- continue;
-
- // First check that the Alloca is static, otherwise it won't have a
- // valid frame index.
- auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
- if (SI == FuncInfo.StaticAllocaMap.end())
- return;
+ // First check that the Alloca is static, otherwise it won't have a
+ // valid frame index.
+ auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
+ if (SI == FuncInfo.StaticAllocaMap.end())
+ return;
- const int FrameIndex = SI->second;
- int64_t Offset;
- if (GetPointerBaseWithConstantOffset(
- ObjectPtr, Offset, DAG.getDataLayout()) != LifetimeObject)
- Offset = -1; // Cannot determine offset from alloca to lifetime object.
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize,
- Offset);
- DAG.setRoot(Res);
- }
+ const int FrameIndex = SI->second;
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize);
+ DAG.setRoot(Res);
return;
}
case Intrinsic::pseudoprobe: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 7fc1558..9474587 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -947,8 +947,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
<< ASC->getDestAddressSpace()
<< ']';
} else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
- if (LN->hasOffset())
- OS << "<" << LN->getOffset() << " to " << LN->getOffset() + LN->getSize() << ">";
+ OS << "<0 to " << LN->getSize() << ">";
} else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
OS << '<' << AA->getAlign().value() << '>';
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e059798..1764910 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -778,7 +778,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false))
+ /*PoisonOnly=*/false, Depth + 1))
return N0;
break;
}
@@ -3369,7 +3369,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false))
+ /*PoisonOnly=*/false,
+ Depth + 1))
return TLO.CombineTo(Op, N0);
// TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
@@ -8128,7 +8129,7 @@ static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
return ISD::matchUnaryPredicate(
Z,
[=](ConstantSDNode *C) { return !C || C->getAPIntValue().urem(BW) != 0; },
- /*AllowUndef=*/true, /*AllowTruncation=*/true);
+ /*AllowUndefs=*/true, /*AllowTruncation=*/true);
}
static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
@@ -8633,9 +8634,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
return SDValue();
SDValue Op1 = Node->getOperand(0);
SDValue Op2 = Node->getOperand(1);
- SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred);
- SelCC->setFlags(Node->getFlags());
- return SelCC;
+ return DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred,
+ Node->getFlags());
}
return SDValue();
@@ -11994,8 +11994,7 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node,
// Get the mask value and add it to the current output position. This
// either increments by 1 if MaskI is true or adds 0 otherwise.
// Freeze in case we have poison/undef mask entries.
- SDValue MaskI =
- DAG.getFreeze(DAG.getExtractVectorElt(DL, MaskScalarVT, Mask, I));
+ SDValue MaskI = DAG.getExtractVectorElt(DL, MaskScalarVT, Mask, I);
MaskI = DAG.getFreeze(MaskI);
MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI);
MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI);
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index b79911b..2a8234a 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -588,7 +588,14 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F,
continue;
Instruction *CheckLoc = dyn_cast<ReturnInst>(BB.getTerminator());
if (!CheckLoc && !DisableCheckNoReturn)
- for (auto &Inst : BB)
+ for (auto &Inst : BB) {
+ if (IntrinsicInst *IB = dyn_cast<IntrinsicInst>(&Inst);
+ IB && (IB->getIntrinsicID() == Intrinsic::eh_sjlj_callsite)) {
+          // eh_sjlj_callsite has to be in the same BB as the
+          // BB terminator. Don't insert within this range.
+ CheckLoc = IB;
+ break;
+ }
if (auto *CB = dyn_cast<CallBase>(&Inst))
// Do stack check before noreturn calls that aren't nounwind (e.g:
// __cxa_throw).
@@ -596,6 +603,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F,
CheckLoc = CB;
break;
}
+ }
if (!CheckLoc)
continue;
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 7e501a9..a40ceaa 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -42,7 +42,6 @@
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PseudoProbe.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
index 2abab02..4d879b6 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
@@ -8,12 +8,9 @@
#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/DebugInfo/DWARF/DWARFExpressionPrinter.h"
#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp
index 7072418..9a7f7d1 100644
--- a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp
+++ b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp
@@ -7,8 +7,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/LowLevel/DWARFExpression.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/Format.h"
#include <cassert>
#include <cstdint>
#include <vector>
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index cca9959..ffc7696 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -738,6 +738,32 @@ static inline uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo) {
return Hi == 63 ? Val >> Lo : (Val & (((1ULL << (Hi + 1)) - 1))) >> Lo;
}
+// Calculate the adjusted page delta between dest and PC. The code is copied
+// from lld; see the comments there for more details.
+static uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc,
+ uint32_t type) {
+ uint64_t pcalau12i_pc;
+ switch (type) {
+ case ELF::R_LARCH_PCALA64_LO20:
+ case ELF::R_LARCH_GOT64_PC_LO20:
+ pcalau12i_pc = pc - 8;
+ break;
+ case ELF::R_LARCH_PCALA64_HI12:
+ case ELF::R_LARCH_GOT64_PC_HI12:
+ pcalau12i_pc = pc - 12;
+ break;
+ default:
+ pcalau12i_pc = pc;
+ break;
+ }
+ uint64_t result = (dest & ~0xfffULL) - (pcalau12i_pc & ~0xfffULL);
+ if (dest & 0x800)
+ result += 0x1000 - 0x1'0000'0000;
+ if (result & 0x8000'0000)
+ result += 0x1'0000'0000;
+ return result;
+}
+
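A worked trace of the common HI20 case with made-up addresses, where neither rounding adjustment fires:

    // type is R_LARCH_PCALA_HI20, so pcalau12i_pc == pc.
    uint64_t dest = 0x12345678, pc = 0x10000ABC;
    uint64_t result = (dest & ~0xfffULL) - (pc & ~0xfffULL); // 0x02345000
    // dest & 0x800 == 0 and bit 31 of result is clear, so no adjustment;
    // the HI20 field then encodes bits [31:12], i.e. 0x02345.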
void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section,
uint64_t Offset,
uint64_t Value, uint32_t Type,
@@ -789,10 +815,7 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section,
case ELF::R_LARCH_GOT_PC_HI20:
case ELF::R_LARCH_PCALA_HI20: {
uint64_t Target = Value + Addend;
- uint64_t TargetPage =
- (Target + (Target & 0x800)) & ~static_cast<uint64_t>(0xfff);
- uint64_t PCPage = FinalAddress & ~static_cast<uint64_t>(0xfff);
- int64_t PageDelta = TargetPage - PCPage;
+ int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type);
auto Instr = support::ulittle32_t::ref(TargetPtr);
uint32_t Imm31_12 = extractBits(PageDelta, /*Hi=*/31, /*Lo=*/12) << 5;
Instr = (Instr & 0xfe00001f) | Imm31_12;
@@ -806,6 +829,24 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section,
Instr = (Instr & 0xffc003ff) | Imm11_0;
break;
}
+ case ELF::R_LARCH_GOT64_PC_LO20:
+ case ELF::R_LARCH_PCALA64_LO20: {
+ uint64_t Target = Value + Addend;
+ int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type);
+ auto Instr = support::ulittle32_t::ref(TargetPtr);
+ uint32_t Imm51_32 = extractBits(PageDelta, /*Hi=*/51, /*Lo=*/32) << 5;
+ Instr = (Instr & 0xfe00001f) | Imm51_32;
+ break;
+ }
+ case ELF::R_LARCH_GOT64_PC_HI12:
+ case ELF::R_LARCH_PCALA64_HI12: {
+ uint64_t Target = Value + Addend;
+ int64_t PageDelta = getLoongArchPageDelta(Target, FinalAddress, Type);
+ auto Instr = support::ulittle32_t::ref(TargetPtr);
+ uint32_t Imm63_52 = extractBits(PageDelta, /*Hi=*/63, /*Lo=*/52) << 10;
+ Instr = (Instr & 0xffc003ff) | Imm63_52;
+ break;
+ }
case ELF::R_LARCH_ABS_HI20: {
uint64_t Target = Value + Addend;
auto Instr = support::ulittle32_t::ref(TargetPtr);
@@ -1758,7 +1799,9 @@ RuntimeDyldELF::processRelocationRef(
MemMgr.allowStubAllocation()) {
resolveLoongArch64Branch(SectionID, Value, RelI, Stubs);
} else if (RelType == ELF::R_LARCH_GOT_PC_HI20 ||
- RelType == ELF::R_LARCH_GOT_PC_LO12) {
+ RelType == ELF::R_LARCH_GOT_PC_LO12 ||
+ RelType == ELF::R_LARCH_GOT64_PC_HI12 ||
+ RelType == ELF::R_LARCH_GOT64_PC_LO20) {
uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_LARCH_64);
resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
RelType);
@@ -2936,7 +2979,9 @@ bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const {
if (Arch == Triple::loongarch64)
return RelTy == ELF::R_LARCH_GOT_PC_HI20 ||
- RelTy == ELF::R_LARCH_GOT_PC_LO12;
+ RelTy == ELF::R_LARCH_GOT_PC_LO12 ||
+ RelTy == ELF::R_LARCH_GOT64_PC_HI12 ||
+ RelTy == ELF::R_LARCH_GOT64_PC_LO20;
if (Arch == Triple::x86_64)
return RelTy == ELF::R_X86_64_GOTPCREL ||
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index b79f6ec..ce35a5b 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -1360,6 +1360,12 @@ void Pattern::printFuzzyMatch(const SourceMgr &SM, StringRef Buffer,
size_t Best = StringRef::npos;
double BestQuality = 0;
+ // Arbitrarily limit quadratic search behavior stemming from long CHECK lines.
+ if (size_t(4096) * size_t(2048) <
+ std::min(size_t(4096), Buffer.size()) *
+ std::max(FixedStr.size(), RegExStr.size()))
+ return;
+
// Use an arbitrary 4k limit on how far we will search.
for (size_t i = 0, e = std::min(size_t(4096), Buffer.size()); i != e; ++i) {
if (Buffer[i] == '\n')
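Restated with named temporaries (the same arithmetic as the guard above): the fuzzy scan is skipped once the scanned window times the pattern length could exceed 4096 * 2048, i.e. roughly 8.4M character comparisons.

    size_t WindowLen = std::min(size_t(4096), Buffer.size());
    size_t PatternLen = std::max(FixedStr.size(), RegExStr.size());
    if (WindowLen * PatternLen > size_t(4096) * size_t(2048))
      return;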
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index f7669f0..53f5934 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
+#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -20,6 +22,42 @@ namespace llvm {
namespace hlsl {
namespace rootsig {
+static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
+ unsigned int OpId) {
+ if (auto *CI =
+ mdconst::dyn_extract<ConstantInt>(Node->getOperand(OpId).get()))
+ return CI->getZExtValue();
+ return std::nullopt;
+}
+
+static std::optional<float> extractMdFloatValue(MDNode *Node,
+ unsigned int OpId) {
+ if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get()))
+ return CI->getValueAPF().convertToFloat();
+ return std::nullopt;
+}
+
+static std::optional<StringRef> extractMdStringValue(MDNode *Node,
+ unsigned int OpId) {
+ MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId));
+ if (NodeText == nullptr)
+ return std::nullopt;
+ return NodeText->getString();
+}
+
+static bool reportError(LLVMContext *Ctx, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
+ return true;
+}
+
+static bool reportValueError(LLVMContext *Ctx, Twine ParamName,
+ uint32_t Value) {
+ Ctx->diagnose(DiagnosticInfoGeneric(
+ "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error));
+ return true;
+}
+
static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
{"CBV", dxil::ResourceClass::CBuffer},
{"SRV", dxil::ResourceClass::SRV},
@@ -189,6 +227,442 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {
return MDNode::get(Ctx, Operands);
}
+bool MetadataParser::parseRootFlags(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootFlagNode) {
+
+ if (RootFlagNode->getNumOperands() != 2)
+ return reportError(Ctx, "Invalid format for RootFlag Element");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
+ RSD.Flags = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RootFlag");
+
+ return false;
+}
+
+bool MetadataParser::parseRootConstants(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootConstantNode) {
+
+ if (RootConstantNode->getNumOperands() != 5)
+ return reportError(Ctx, "Invalid format for RootConstants Element");
+
+ dxbc::RTS0::v1::RootParameterHeader Header;
+ // The parameter offset doesn't matter here - we recalculate it during
+ // serialization.
+ Header.ParameterOffset = 0;
+ Header.ParameterType =
+ llvm::to_underlying(dxbc::RootParameterType::Constants32Bit);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
+ Header.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ dxbc::RTS0::v1::RootConstants Constants;
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
+ Constants.ShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
+ Constants.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
+ Constants.Num32BitValues = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Num32BitValues");
+
+ RSD.ParametersContainer.addParameter(Header, Constants);
+
+ return false;
+}
+
+bool MetadataParser::parseRootDescriptors(
+ LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootDescriptorNode, RootSignatureElementKind ElementKind) {
+ assert((ElementKind == RootSignatureElementKind::SRV ||
+ ElementKind == RootSignatureElementKind::UAV ||
+ ElementKind == RootSignatureElementKind::CBV) &&
+ "parseRootDescriptors should only be called with RootDescriptor "
+ "element kind.");
+ if (RootDescriptorNode->getNumOperands() != 5)
+ return reportError(Ctx, "Invalid format for Root Descriptor Element");
+
+ dxbc::RTS0::v1::RootParameterHeader Header;
+ switch (ElementKind) {
+ case RootSignatureElementKind::SRV:
+ Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV);
+ break;
+ case RootSignatureElementKind::UAV:
+ Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV);
+ break;
+ case RootSignatureElementKind::CBV:
+ Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV);
+ break;
+ default:
+ llvm_unreachable("invalid Root Descriptor kind");
+ break;
+ }
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
+ Header.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ dxbc::RTS0::v2::RootDescriptor Descriptor;
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
+ Descriptor.ShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
+ Descriptor.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (RSD.Version == 1) {
+ RSD.ParametersContainer.addParameter(Header, Descriptor);
+ return false;
+ }
+ assert(RSD.Version > 1);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
+ Descriptor.Flags = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Root Descriptor Flags");
+
+ RSD.ParametersContainer.addParameter(Header, Descriptor);
+ return false;
+}
+
+bool MetadataParser::parseDescriptorRange(LLVMContext *Ctx,
+ mcdxbc::DescriptorTable &Table,
+ MDNode *RangeDescriptorNode) {
+
+ if (RangeDescriptorNode->getNumOperands() != 6)
+ return reportError(Ctx, "Invalid format for Descriptor Range");
+
+ dxbc::RTS0::v2::DescriptorRange Range;
+
+ std::optional<StringRef> ElementText =
+ extractMdStringValue(RangeDescriptorNode, 0);
+
+ if (!ElementText.has_value())
+ return reportError(Ctx, "Descriptor Range, first element is not a string.");
+
+ Range.RangeType =
+ StringSwitch<uint32_t>(*ElementText)
+ .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
+ .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
+ .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
+ .Case("Sampler",
+ llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
+ .Default(~0U);
+
+ if (Range.RangeType == ~0U)
+ return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
+ Range.NumDescriptors = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
+ Range.BaseShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for BaseShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
+ Range.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
+ Range.OffsetInDescriptorsFromTableStart = *Val;
+ else
+ return reportError(Ctx,
+ "Invalid value for OffsetInDescriptorsFromTableStart");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
+ Range.Flags = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Descriptor Range Flags");
+
+ Table.Ranges.push_back(Range);
+ return false;
+}
+
+bool MetadataParser::parseDescriptorTable(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *DescriptorTableNode) {
+ const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
+ if (NumOperands < 2)
+ return reportError(Ctx, "Invalid format for Descriptor Table");
+
+ dxbc::RTS0::v1::RootParameterHeader Header;
+ if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
+ Header.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ mcdxbc::DescriptorTable Table;
+ Header.ParameterType =
+ llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
+
+ for (unsigned int I = 2; I < NumOperands; I++) {
+ MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
+ if (Element == nullptr)
+ return reportError(Ctx, "Missing Root Element Metadata Node.");
+
+ if (parseDescriptorRange(Ctx, Table, Element))
+ return true;
+ }
+
+ RSD.ParametersContainer.addParameter(Header, Table);
+ return false;
+}
+
+bool MetadataParser::parseStaticSampler(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *StaticSamplerNode) {
+ if (StaticSamplerNode->getNumOperands() != 14)
+ return reportError(Ctx, "Invalid format for Static Sampler");
+
+ dxbc::RTS0::v1::StaticSampler Sampler;
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1))
+ Sampler.Filter = *Val;
+ else
+ return reportError(Ctx, "Invalid value for Filter");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2))
+ Sampler.AddressU = *Val;
+ else
+ return reportError(Ctx, "Invalid value for AddressU");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3))
+ Sampler.AddressV = *Val;
+ else
+ return reportError(Ctx, "Invalid value for AddressV");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4))
+ Sampler.AddressW = *Val;
+ else
+ return reportError(Ctx, "Invalid value for AddressW");
+
+ if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
+ Sampler.MipLODBias = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MipLODBias");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
+ Sampler.MaxAnisotropy = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MaxAnisotropy");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7))
+ Sampler.ComparisonFunc = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ComparisonFunc ");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8))
+ Sampler.BorderColor = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ComparisonFunc ");
+
+ if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
+ Sampler.MinLOD = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MinLOD");
+
+ if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
+ Sampler.MaxLOD = *Val;
+ else
+ return reportError(Ctx, "Invalid value for MaxLOD");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
+ Sampler.ShaderRegister = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderRegister");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
+ Sampler.RegisterSpace = *Val;
+ else
+ return reportError(Ctx, "Invalid value for RegisterSpace");
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13))
+ Sampler.ShaderVisibility = *Val;
+ else
+ return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+ RSD.StaticSamplers.push_back(Sampler);
+ return false;
+}
+
+bool MetadataParser::parseRootSignatureElement(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *Element) {
+ std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
+ if (!ElementText.has_value())
+ return reportError(Ctx, "Invalid format for Root Element");
+
+ RootSignatureElementKind ElementKind =
+ StringSwitch<RootSignatureElementKind>(*ElementText)
+ .Case("RootFlags", RootSignatureElementKind::RootFlags)
+ .Case("RootConstants", RootSignatureElementKind::RootConstants)
+ .Case("RootCBV", RootSignatureElementKind::CBV)
+ .Case("RootSRV", RootSignatureElementKind::SRV)
+ .Case("RootUAV", RootSignatureElementKind::UAV)
+ .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
+ .Case("StaticSampler", RootSignatureElementKind::StaticSamplers)
+ .Default(RootSignatureElementKind::Error);
+
+ switch (ElementKind) {
+
+ case RootSignatureElementKind::RootFlags:
+ return parseRootFlags(Ctx, RSD, Element);
+ case RootSignatureElementKind::RootConstants:
+ return parseRootConstants(Ctx, RSD, Element);
+ case RootSignatureElementKind::CBV:
+ case RootSignatureElementKind::SRV:
+ case RootSignatureElementKind::UAV:
+ return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
+ case RootSignatureElementKind::DescriptorTable:
+ return parseDescriptorTable(Ctx, RSD, Element);
+ case RootSignatureElementKind::StaticSamplers:
+ return parseStaticSampler(Ctx, RSD, Element);
+ case RootSignatureElementKind::Error:
+ return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
+ }
+
+ llvm_unreachable("Unhandled RootSignatureElementKind enum.");
+}
+
+bool MetadataParser::validateRootSignature(
+ LLVMContext *Ctx, const llvm::mcdxbc::RootSignatureDesc &RSD) {
+ if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) {
+ return reportValueError(Ctx, "Version", RSD.Version);
+ }
+
+ if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
+ return reportValueError(Ctx, "RootFlags", RSD.Flags);
+ }
+
+ for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
+ if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
+ return reportValueError(Ctx, "ShaderVisibility",
+ Info.Header.ShaderVisibility);
+
+ assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
+ "Invalid value for ParameterType");
+
+ switch (Info.Header.ParameterType) {
+
+ case llvm::to_underlying(dxbc::RootParameterType::CBV):
+ case llvm::to_underlying(dxbc::RootParameterType::UAV):
+ case llvm::to_underlying(dxbc::RootParameterType::SRV): {
+ const dxbc::RTS0::v2::RootDescriptor &Descriptor =
+ RSD.ParametersContainer.getRootDescriptor(Info.Location);
+ if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
+ return reportValueError(Ctx, "ShaderRegister",
+ Descriptor.ShaderRegister);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
+ return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace);
+
+ if (RSD.Version > 1) {
+ if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version,
+ Descriptor.Flags))
+ return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags);
+ }
+ break;
+ }
+ case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+ const mcdxbc::DescriptorTable &Table =
+ RSD.ParametersContainer.getDescriptorTable(Info.Location);
+ for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
+ if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType))
+ return reportValueError(Ctx, "RangeType", Range.RangeType);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
+ return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
+
+ if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
+ return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors);
+
+ if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag(
+ RSD.Version, Range.RangeType, Range.Flags))
+ return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
+ }
+ break;
+ }
+ }
+ }
+
+ for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) {
+ if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter))
+ return reportValueError(Ctx, "Filter", Sampler.Filter);
+
+ if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU))
+ return reportValueError(Ctx, "AddressU", Sampler.AddressU);
+
+ if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV))
+ return reportValueError(Ctx, "AddressV", Sampler.AddressV);
+
+ if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW))
+ return reportValueError(Ctx, "AddressW", Sampler.AddressW);
+
+ if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
+ return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias);
+
+ if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
+ return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy);
+
+ if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc))
+ return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc);
+
+ if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor))
+ return reportValueError(Ctx, "BorderColor", Sampler.BorderColor);
+
+ if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD))
+ return reportValueError(Ctx, "MinLOD", Sampler.MinLOD);
+
+ if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
+ return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
+ return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister);
+
+ if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
+ return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace);
+
+ if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility))
+ return reportValueError(Ctx, "ShaderVisibility",
+ Sampler.ShaderVisibility);
+ }
+
+ return false;
+}
+
+bool MetadataParser::ParseRootSignature(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD) {
+ bool HasError = false;
+
+ // Loop through the Root Elements of the root signature.
+ for (const auto &Operand : Root->operands()) {
+ MDNode *Element = dyn_cast<MDNode>(Operand);
+ if (Element == nullptr)
+ return reportError(Ctx, "Missing Root Element Metadata Node.");
+
+ // Accumulate errors without short-circuiting so that every element is
+ // parsed and validated.
+ HasError |= parseRootSignatureElement(Ctx, RSD, Element);
+ HasError |= validateRootSignature(Ctx, RSD);
+ }
+
+ return HasError;
+}
} // namespace rootsig
} // namespace hlsl
} // namespace llvm
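
For reference, the parse* functions above address operands purely by position: operand 0 is the element tag string and the remaining operands are i32 or float constants. A minimal sketch of building a node that parseRootConstants() would accept, with illustrative values (buildRootConstantsNode is hypothetical, not part of this patch):

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Build the five-operand node parseRootConstants() expects:
//   !{!"RootConstants", i32 Visibility, i32 Register, i32 Space, i32 Num32}
// The numeric values here are illustrative only.
static MDNode *buildRootConstantsNode(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  auto MakeInt = [&](uint32_t V) -> Metadata * {
    return ConstantAsMetadata::get(ConstantInt::get(I32, V));
  };
  Metadata *Ops[] = {MDString::get(Ctx, "RootConstants"),
                     MakeInt(0),  // ShaderVisibility (operand 1)
                     MakeInt(0),  // ShaderRegister   (operand 2)
                     MakeInt(0),  // RegisterSpace    (operand 3)
                     MakeInt(4)}; // Num32BitValues   (operand 4)
  return MDNode::get(Ctx, Ops);
}
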
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 145ef10..e5a4e1e 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
break;
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ Out << "amdgpu_gfx_whole_wave";
+ break;
case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break;
case CallingConv::RISCV_VectorCall:
Out << "riscv_vector_cc";
@@ -2398,8 +2401,9 @@ static void writeDIFile(raw_ostream &Out, const DIFile *N, AsmWriterContext &) {
// Print all values for checksum together, or not at all.
if (N->getChecksum())
Printer.printChecksum(*N->getChecksum());
- Printer.printString("source", N->getSource().value_or(StringRef()),
- /* ShouldSkipEmpty */ true);
+ if (N->getSource())
+ Printer.printString("source", *N->getSource(),
+ /* ShouldSkipEmpty */ false);
Out << ")";
}
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 86285a0..7159107 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1310,6 +1310,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
return true;
}
break;
+ case 'l':
+ if (Name.starts_with("lifetime.start") ||
+ Name.starts_with("lifetime.end")) {
+ // Unless remangling is required, do not upgrade the function declaration,
+ // but do upgrade the calls.
+ if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F))
+ NewFn = *Result;
+ else
+ NewFn = F;
+ return true;
+ }
+ break;
case 'm': {
// Updating the memory intrinsics (memcpy/memmove/memset) that have an
// alignment parameter to embedding the alignment as an attribute of
@@ -1438,6 +1450,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
.Case("popc.ll", true)
.Case("h2f", true)
.Case("swap.lo.hi.b64", true)
+ .Case("tanh.approx.f32", true)
.Default(false);
if (Expand) {
@@ -1629,7 +1642,6 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn,
NewFn = nullptr;
bool Upgraded =
upgradeIntrinsicFunction1(F, NewFn, CanUpgradeDebugIntrinsicsToRecords);
- assert(F != NewFn && "Intrinsic function upgraded to the same function");
// Upgrade intrinsic attributes. This does not change the function.
if (NewFn)
@@ -2532,6 +2544,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
+ } else if (Name == "tanh.approx.f32") {
+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32
+ FastMathFlags FMF;
+ FMF.setApproxFunc();
+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),
+ FMF);
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
Value *Arg =
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
@@ -4570,6 +4588,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
}
const auto &DefaultCase = [&]() -> void {
+ if (F == NewFn)
+ return;
+
if (CI->getFunctionType() == NewFn->getFunctionType()) {
// Handle generic mangling change.
assert(
@@ -5109,6 +5130,31 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
MTI->setSourceAlignment(Align->getMaybeAlignValue());
break;
}
+
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ Value *Size = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ if (isa<AllocaInst>(Ptr)) {
+ DefaultCase();
+ return;
+ }
+
+ // Try to strip pointer casts, such that the lifetime works on an alloca.
+ Ptr = Ptr->stripPointerCasts();
+ if (isa<AllocaInst>(Ptr)) {
+ // Don't use NewFn, as we might have looked through an addrspacecast.
+ if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start)
+ NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size));
+ else
+ NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size));
+ break;
+ }
+
+ // Otherwise remove the lifetime marker.
+ CI->eraseFromParent();
+ return;
+ }
}
assert(NewCall && "Should have either set this variable or returned through "
"the default case");
@@ -5131,7 +5177,8 @@ void llvm::UpgradeCallsToIntrinsic(Function *F) {
UpgradeIntrinsicCall(CB, NewFn);
// Remove old function, no longer used, from the module.
- F->eraseFromParent();
+ if (F != NewFn)
+ F->eraseFromParent();
}
}
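
The lifetime upgrade added above boils down to one rule: keep the marker only when the pointer, after stripping casts, is an alloca, and otherwise delete it. A condensed sketch of that rule, assuming CI is a two-operand (size, pointer) lifetime call in the old style (upgradeLifetime is hypothetical):

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Rewrite one old-style lifetime call; returns the replacement or null if
// the marker was dropped.
static Instruction *upgradeLifetime(CallInst *CI, bool IsStart) {
  Value *Size = CI->getArgOperand(0);
  Value *Ptr = CI->getArgOperand(1)->stripPointerCasts();
  IRBuilder<> Builder(CI);
  Instruction *New = nullptr;
  if (isa<AllocaInst>(Ptr))
    New = IsStart ? Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size))
                  : Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size));
  CI->eraseFromParent(); // the old call is replaced or simply removed
  return New;
}
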
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 8fb33c3..ab8ecee 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -45,25 +45,6 @@ using namespace llvm;
using namespace llvm::at;
using namespace llvm::dwarf;
-TinyPtrVector<DbgDeclareInst *> llvm::findDbgDeclares(Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup. This check is a bitfield datamember lookup.
- if (!V->isUsedByMetadata())
- return {};
- auto *L = ValueAsMetadata::getIfExists(V);
- if (!L)
- return {};
- auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L);
- if (!MDV)
- return {};
-
- TinyPtrVector<DbgDeclareInst *> Declares;
- for (User *U : MDV->users())
- if (auto *DDI = dyn_cast<DbgDeclareInst>(U))
- Declares.push_back(DDI);
-
- return Declares;
-}
TinyPtrVector<DbgVariableRecord *> llvm::findDVRDeclares(Value *V) {
// This function is hot. Check whether the value has any metadata to avoid a
// DenseMap lookup. This check is a bitfield datamember lookup.
@@ -98,42 +79,31 @@ TinyPtrVector<DbgVariableRecord *> llvm::findDVRValues(Value *V) {
return Values;
}
-template <typename IntrinsicT, bool DbgAssignAndValuesOnly>
+template <bool DbgAssignAndValuesOnly>
static void
-findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
+findDbgIntrinsics(Value *V,
+ SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
// This function is hot. Check whether the value has any metadata to avoid a
// DenseMap lookup.
if (!V->isUsedByMetadata())
return;
- LLVMContext &Ctx = V->getContext();
// TODO: If this value appears multiple times in a DIArgList, we should still
- // only add the owning DbgValueInst once; use this set to track ArgListUsers.
+ // only add the owning dbg.value once; use this set to track ArgListUsers.
// This behaviour can be removed when we can automatically remove duplicates.
// V will also appear twice in a dbg.assign if it's used in both the value
// and address components.
- SmallPtrSet<IntrinsicT *, 4> EncounteredIntrinsics;
SmallPtrSet<DbgVariableRecord *, 4> EncounteredDbgVariableRecords;
- /// Append IntrinsicT users of MetadataAsValue(MD).
- auto AppendUsers = [&Ctx, &EncounteredIntrinsics,
- &EncounteredDbgVariableRecords, &Result,
- DbgVariableRecords](Metadata *MD) {
- if (auto *MDV = MetadataAsValue::getIfExists(Ctx, MD)) {
- for (User *U : MDV->users())
- if (IntrinsicT *DVI = dyn_cast<IntrinsicT>(U))
- if (EncounteredIntrinsics.insert(DVI).second)
- Result.push_back(DVI);
- }
- if (!DbgVariableRecords)
- return;
+ /// Append users of MetadataAsValue(MD).
+ auto AppendUsers = [&EncounteredDbgVariableRecords,
+ &DbgVariableRecords](Metadata *MD) {
// Get DbgVariableRecords that use this as a single value.
if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) {
for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) {
if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign())
if (EncounteredDbgVariableRecords.insert(DVR).second)
- DbgVariableRecords->push_back(DVR);
+ DbgVariableRecords.push_back(DVR);
}
}
};
@@ -142,29 +112,23 @@ findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
AppendUsers(L);
for (Metadata *AL : L->getAllArgListUsers()) {
AppendUsers(AL);
- if (!DbgVariableRecords)
- continue;
DIArgList *DI = cast<DIArgList>(AL);
for (DbgVariableRecord *DVR : DI->getAllDbgVariableRecordUsers())
if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign())
if (EncounteredDbgVariableRecords.insert(DVR).second)
- DbgVariableRecords->push_back(DVR);
+ DbgVariableRecords.push_back(DVR);
}
}
}
void llvm::findDbgValues(
- SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
- findDbgIntrinsics<DbgValueInst, /*DbgAssignAndValuesOnly=*/true>(
- DbgValues, V, DbgVariableRecords);
+ Value *V, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+ findDbgIntrinsics</*DbgAssignAndValuesOnly=*/true>(V, DbgVariableRecords);
}
void llvm::findDbgUsers(
- SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, Value *V,
- SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
- findDbgIntrinsics<DbgVariableIntrinsic, /*DbgAssignAndValuesOnly=*/false>(
- DbgUsers, V, DbgVariableRecords);
+ Value *V, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+ findDbgIntrinsics</*DbgAssignAndValuesOnly=*/false>(V, DbgVariableRecords);
}
DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
@@ -173,18 +137,6 @@ DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
return nullptr;
}
-DebugLoc llvm::getDebugValueLoc(DbgVariableIntrinsic *DII) {
- // Original dbg.declare must have a location.
- const DebugLoc &DeclareLoc = DII->getDebugLoc();
- MDNode *Scope = DeclareLoc.getScope();
- DILocation *InlinedAt = DeclareLoc.getInlinedAt();
- // Because no machine insts can come from debug intrinsics, only the scope
- // and inlinedAt is significant. Zero line numbers are used in case this
- // DebugLoc leaks into any adjacent instructions. Produce an unknown location
- // with the correct scope / inlinedAt fields.
- return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt);
-}
-
DebugLoc llvm::getDebugValueLoc(DbgVariableRecord *DVR) {
// Original dbg.declare must have a location.
const DebugLoc &DeclareLoc = DVR->getDebugLoc();
@@ -852,19 +804,6 @@ void DebugTypeInfoRemoval::traverse(MDNode *N) {
bool llvm::stripNonLineTableDebugInfo(Module &M) {
bool Changed = false;
- // First off, delete the debug intrinsics.
- auto RemoveUses = [&](StringRef Name) {
- if (auto *DbgVal = M.getFunction(Name)) {
- while (!DbgVal->use_empty())
- cast<Instruction>(DbgVal->user_back())->eraseFromParent();
- DbgVal->eraseFromParent();
- Changed = true;
- }
- };
- RemoveUses("llvm.dbg.declare");
- RemoveUses("llvm.dbg.label");
- RemoveUses("llvm.dbg.value");
-
// Delete non-CU debug info named metadata nodes.
for (auto NMI = M.named_metadata_begin(), NME = M.named_metadata_end();
NMI != NME;) {
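
After this change the findDbgValues/findDbgUsers entry points take a single out-vector of DbgVariableRecords; the intrinsic-based out-parameter is gone. A minimal sketch of what a migrated caller looks like (printDebugUses is hypothetical):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugProgramInstruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Collect the debug records describing V and print them; with the new API
// there is no separate intrinsic vector to maintain.
static void printDebugUses(Value *V) {
  SmallVector<DbgVariableRecord *> DVRs;
  findDbgValues(V, DVRs); // dbg.value/dbg.assign style records only
  for (DbgVariableRecord *DVR : DVRs)
    DVR->print(errs());
}
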
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 7a03663..fc06745 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) {
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::WASM_EmscriptenInvoke:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
case CallingConv::M68k_INTR:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
diff --git a/llvm/lib/IR/PassInstrumentation.cpp b/llvm/lib/IR/PassInstrumentation.cpp
index 94ad124..70bbe8f 100644
--- a/llvm/lib/IR/PassInstrumentation.cpp
+++ b/llvm/lib/IR/PassInstrumentation.cpp
@@ -23,6 +23,7 @@ template struct LLVM_EXPORT_TEMPLATE Any::TypeId<const Loop *>;
void PassInstrumentationCallbacks::addClassToPassName(StringRef ClassName,
StringRef PassName) {
+ assert(!PassName.empty() && "PassName can't be empty!");
ClassToPassName.try_emplace(ClassName, PassName.str());
}
@@ -33,7 +34,10 @@ PassInstrumentationCallbacks::getPassNameForClassName(StringRef ClassName) {
Fn();
ClassToPassNameCallbacks.clear();
}
- return ClassToPassName[ClassName];
+ auto PassNameIter = ClassToPassName.find(ClassName);
+ if (PassNameIter != ClassToPassName.end())
+ return PassNameIter->second;
+ return {};
}
AnalysisKey PassInstrumentationAnalysis::Key;
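
The getPassNameForClassName fix above avoids map operator[], which inserts a default-constructed value on a miss and so mutates the map during what should be a read-only lookup. The same pattern in isolation, assuming a StringMap, which matches the try_emplace/find usage above:

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include <string>

using namespace llvm;

// Look up Key without inserting: find() leaves the map untouched on a miss,
// while operator[] would default-construct and insert an empty entry.
static StringRef lookupNoInsert(const StringMap<std::string> &M,
                                StringRef Key) {
  auto It = M.find(Key);
  return It != M.end() ? StringRef(It->second) : StringRef();
}
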
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 5e1bf28..9c34662 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -304,14 +304,12 @@ IntegerType *Type::getIntNTy(LLVMContext &C, unsigned N) {
Type *Type::getWasm_ExternrefTy(LLVMContext &C) {
// opaque pointer in addrspace(10)
- static PointerType *Ty = PointerType::get(C, 10);
- return Ty;
+ return PointerType::get(C, 10);
}
Type *Type::getWasm_FuncrefTy(LLVMContext &C) {
// opaque pointer in addrspace(20)
- static PointerType *Ty = PointerType::get(C, 20);
- return Ty;
+ return PointerType::get(C, 20);
}
//===----------------------------------------------------------------------===//
@@ -324,12 +322,12 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
// Check for the built-in integer types
switch (NumBits) {
- case 1: return cast<IntegerType>(Type::getInt1Ty(C));
- case 8: return cast<IntegerType>(Type::getInt8Ty(C));
- case 16: return cast<IntegerType>(Type::getInt16Ty(C));
- case 32: return cast<IntegerType>(Type::getInt32Ty(C));
- case 64: return cast<IntegerType>(Type::getInt64Ty(C));
- case 128: return cast<IntegerType>(Type::getInt128Ty(C));
+ case 1: return Type::getInt1Ty(C);
+ case 8: return Type::getInt8Ty(C);
+ case 16: return Type::getInt16Ty(C);
+ case 32: return Type::getInt32Ty(C);
+ case 64: return Type::getInt64Ty(C);
+ case 128: return Type::getInt128Ty(C);
default:
break;
}
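
The Type.cpp change above removes function-local statics that cached a PointerType from whichever LLVMContext happened to call first; with more than one context the cached pointer belonged to the wrong context. A small sketch of the failure mode the fix addresses:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext A, B;
  Type *TA = Type::getWasm_ExternrefTy(A);
  Type *TB = Type::getWasm_ExternrefTy(B);
  // With the function-local static, TB aliased the type cached for A; after
  // the fix each call builds the pointer type in its own context.
  assert(&TA->getContext() == &A);
  assert(&TB->getContext() == &B);
  return 0;
}
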
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 02c16e2..129ca4a 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -582,16 +582,11 @@ void Value::replaceUsesWithIf(Value *New,
}
}
-/// Replace llvm.dbg.* uses of MetadataAsValue(ValueAsMetadata(V)) outside BB
+/// Replace debug record uses of MetadataAsValue(ValueAsMetadata(V)) outside BB
/// with New.
static void replaceDbgUsesOutsideBlock(Value *V, Value *New, BasicBlock *BB) {
- SmallVector<DbgVariableIntrinsic *> DbgUsers;
SmallVector<DbgVariableRecord *> DPUsers;
- findDbgUsers(DbgUsers, V, &DPUsers);
- for (auto *DVI : DbgUsers) {
- if (DVI->getParent() != BB)
- DVI->replaceVariableLocationOp(V, New);
- }
+ findDbgUsers(V, DPUsers);
for (auto *DVR : DPUsers) {
DbgMarker *Marker = DVR->getMarker();
if (Marker->getParent() != BB)
@@ -752,28 +747,34 @@ const Value *Value::stripAndAccumulateConstantOffsets(
// means when we construct GEPOffset, we need to use the size
// of GEP's pointer type rather than the size of the original
// pointer type.
- APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
- return V;
-
- // Stop traversal if the pointer offset wouldn't fit in the bit-width
- // provided by the Offset argument. This can happen due to AddrSpaceCast
- // stripping.
- if (GEPOffset.getSignificantBits() > BitWidth)
- return V;
-
- // External Analysis can return a result higher/lower than the value
- // represents. We need to detect overflow/underflow.
- APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
- if (!ExternalAnalysis) {
- Offset += GEPOffsetST;
+ unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType());
+ if (CurBitWidth == BitWidth) {
+ if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis))
+ return V;
} else {
- bool Overflow = false;
- APInt OldOffset = Offset;
- Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
- if (Overflow) {
- Offset = OldOffset;
+ APInt GEPOffset(CurBitWidth, 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
+ return V;
+
+ // Stop traversal if the pointer offset wouldn't fit in the bit-width
+ // provided by the Offset argument. This can happen due to AddrSpaceCast
+ // stripping.
+ if (GEPOffset.getSignificantBits() > BitWidth)
return V;
+
+ // External Analysis can return a result higher/lower than the value
+ // represents. We need to detect overflow/underflow.
+ APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
+ if (!ExternalAnalysis) {
+ Offset += GEPOffsetST;
+ } else {
+ bool Overflow = false;
+ APInt OldOffset = Offset;
+ Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
+ if (Overflow) {
+ Offset = OldOffset;
+ return V;
+ }
}
}
V = GEP->getPointerOperand();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9bd573e..3ff9895 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2979,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) {
"perfect forwarding!",
&F);
break;
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1),
+ "Calling convention requires first argument to be i1", &F);
+ Check(!F.arg_begin()->hasInRegAttr(),
+ "Calling convention requires first argument to not be inreg", &F);
+ Check(!F.isVarArg(),
+ "Calling convention does not support varargs or "
+ "perfect forwarding!",
+ &F);
+ break;
}
// Check that the argument values match the function type for this function...
@@ -6658,6 +6668,54 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
break;
}
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ Value *Src0 = Call.getArgOperand(1);
+ Value *Src1 = Call.getArgOperand(3);
+
+ unsigned FmtA = cast<ConstantInt>(Call.getArgOperand(0))->getZExtValue();
+ unsigned FmtB = cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue();
+ Check(FmtA <= 4, "invalid value for matrix format", Call,
+ Call.getArgOperand(0));
+ Check(FmtB <= 4, "invalid value for matrix format", Call,
+ Call.getArgOperand(2));
+
+ // AMDGPU::MatrixFMT values
+ auto getFormatNumRegs = [](unsigned FormatVal) {
+ switch (FormatVal) {
+ case 0:
+ case 1:
+ return 16u;
+ case 2:
+ case 3:
+ return 12u;
+ case 4:
+ return 8u;
+ default:
+ llvm_unreachable("invalid format value");
+ }
+ };
+
+ auto isValidSrcASrcBVector = [](FixedVectorType *Ty) {
+ if (!Ty || !Ty->getElementType()->isIntegerTy(32))
+ return false;
+ unsigned NumElts = Ty->getNumElements();
+ return NumElts == 16 || NumElts == 12 || NumElts == 8;
+ };
+
+ auto *Src0Ty = dyn_cast<FixedVectorType>(Src0->getType());
+ auto *Src1Ty = dyn_cast<FixedVectorType>(Src1->getType());
+ Check(isValidSrcASrcBVector(Src0Ty),
+ "operand 1 must be 8, 12 or 16 element i32 vector", &Call, Src0);
+ Check(isValidSrcASrcBVector(Src1Ty),
+ "operand 3 must be 8, 12 or 16 element i32 vector", &Call, Src1);
+
+ // Permit excess registers for the format.
+ Check(Src0Ty->getNumElements() >= getFormatNumRegs(FmtA),
+ "invalid vector type for format", &Call, Src0, Call.getArgOperand(0));
+ Check(Src1Ty->getNumElements() >= getFormatNumRegs(FmtB),
+ "invalid vector type for format", &Call, Src1, Call.getArgOperand(2));
+ break;
+ }
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
Value *V = Call.getArgOperand(0);
@@ -6710,6 +6768,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"llvm.threadlocal.address operand isThreadLocal() must be true");
break;
}
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ Check(isa<AllocaInst>(Call.getArgOperand(1)),
+ "llvm.lifetime.start/end can only be used on alloca", &Call);
+ break;
};
// Verify that there aren't any unmediated control transfers between funclets.
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 67c53e0..7119ef4 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -345,7 +345,7 @@ public:
void emitIdent(StringRef IdentString) override;
void emitCFIBKeyFrame() override;
void emitCFIMTETaggedFrame() override;
- void emitCFISections(bool EH, bool Debug) override;
+ void emitCFISections(bool EH, bool Debug, bool SFrame) override;
void emitCFIDefCfa(int64_t Register, int64_t Offset, SMLoc Loc) override;
void emitCFIDefCfaOffset(int64_t Offset, SMLoc Loc) override;
void emitCFIDefCfaRegister(int64_t Register, SMLoc Loc) override;
@@ -1906,15 +1906,24 @@ void MCAsmStreamer::emitIdent(StringRef IdentString) {
EmitEOL();
}
-void MCAsmStreamer::emitCFISections(bool EH, bool Debug) {
- MCStreamer::emitCFISections(EH, Debug);
+void MCAsmStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
+ MCStreamer::emitCFISections(EH, Debug, SFrame);
OS << "\t.cfi_sections ";
+ bool C = false;
if (EH) {
OS << ".eh_frame";
- if (Debug)
- OS << ", .debug_frame";
- } else if (Debug) {
+ C = true;
+ }
+ if (Debug) {
+ if (C)
+ OS << ", ";
OS << ".debug_frame";
+ C = true;
+ }
+ if (SFrame) {
+ if (C)
+ OS << ", ";
+ OS << ".sframe";
}
EmitEOL();
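
With the SFrame parameter threaded through, requesting all three tables now produces one comma-separated directive instead of the old two-way EH/Debug special case. A minimal usage sketch (requestAllFrameSections is hypothetical):

#include "llvm/MC/MCStreamer.h"

// Request all three frame tables; MCAsmStreamer now renders this as
//   .cfi_sections .eh_frame, .debug_frame, .sframe
// in a single directive.
static void requestAllFrameSections(llvm::MCStreamer &S) {
  S.emitCFISections(/*EH=*/true, /*Debug=*/true, /*SFrame=*/true);
}
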
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 9420924..e142ac1 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -8,7 +8,6 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -196,6 +195,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const {
switch (F.getKind()) {
case MCFragment::FT_Data:
case MCFragment::FT_Relaxable:
+ case MCFragment::FT_Align:
case MCFragment::FT_LEB:
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame:
@@ -226,28 +226,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const {
case MCFragment::FT_SymbolId:
return 4;
- case MCFragment::FT_Align: {
- const MCAlignFragment &AF = cast<MCAlignFragment>(F);
- unsigned Offset = getFragmentOffset(AF);
- unsigned Size = offsetToAlignment(Offset, AF.getAlignment());
-
- // Insert extra Nops for code alignment if the target define
- // shouldInsertExtraNopBytesForCodeAlign target hook.
- if (AF.getParent()->useCodeAlign() && AF.hasEmitNops() &&
- getBackend().shouldInsertExtraNopBytesForCodeAlign(AF, Size))
- return Size;
-
- // If we are padding with nops, force the padding to be larger than the
- // minimum nop size.
- if (Size > 0 && AF.hasEmitNops()) {
- while (Size % getBackend().getMinimumNopSize())
- Size += AF.getAlignment().value();
- }
- if (Size > AF.getMaxBytesToEmit())
- return 0;
- return Size;
- }
-
case MCFragment::FT_Org: {
const MCOrgFragment &OF = cast<MCOrgFragment>(F);
MCValue Value;
@@ -383,7 +361,7 @@ uint64_t MCAssembler::getSectionAddressSize(const MCSection &Sec) const {
uint64_t MCAssembler::getSectionFileSize(const MCSection &Sec) const {
// Virtual sections have no file size.
- if (Sec.isVirtualSection())
+ if (Sec.isBssSection())
return 0;
return getSectionAddressSize(Sec);
}
@@ -431,48 +409,45 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
const auto &EF = cast<MCFragment>(F);
OS << StringRef(EF.getContents().data(), EF.getContents().size());
OS << StringRef(EF.getVarContents().data(), EF.getVarContents().size());
- break;
- }
+ } break;
+
case MCFragment::FT_Align: {
++stats::EmittedAlignFragments;
- const MCAlignFragment &AF = cast<MCAlignFragment>(F);
- assert(AF.getFillLen() && "Invalid virtual align in concrete fragment!");
+ OS << StringRef(F.getContents().data(), F.getContents().size());
+ assert(F.getAlignFillLen() &&
+ "Invalid virtual align in concrete fragment!");
- uint64_t Count = FragmentSize / AF.getFillLen();
- assert(FragmentSize % AF.getFillLen() == 0 &&
+ uint64_t Count = (FragmentSize - F.getFixedSize()) / F.getAlignFillLen();
+ assert((FragmentSize - F.getFixedSize()) % F.getAlignFillLen() == 0 &&
"computeFragmentSize computed size is incorrect");
- // See if we are aligning with nops, and if so do that first to try to fill
- // the Count bytes. Then if that did not fill any bytes or there are any
- // bytes left to fill use the Value and ValueSize to fill the rest.
- // If we are aligning with nops, ask that target to emit the right data.
- if (AF.hasEmitNops()) {
- if (!Asm.getBackend().writeNopData(OS, Count, AF.getSubtargetInfo()))
- report_fatal_error("unable to write nop sequence of " +
- Twine(Count) + " bytes");
- break;
- }
-
- // Otherwise, write out in multiples of the value size.
- for (uint64_t i = 0; i != Count; ++i) {
- switch (AF.getFillLen()) {
- default: llvm_unreachable("Invalid size!");
- case 1:
- OS << char(AF.getFill());
- break;
- case 2:
- support::endian::write<uint16_t>(OS, AF.getFill(), Endian);
- break;
- case 4:
- support::endian::write<uint32_t>(OS, AF.getFill(), Endian);
- break;
- case 8:
- support::endian::write<uint64_t>(OS, AF.getFill(), Endian);
- break;
+ // In the nops mode, call the backend hook to write `Count` nops.
+ if (F.hasAlignEmitNops()) {
+ if (!Asm.getBackend().writeNopData(OS, Count, F.getSubtargetInfo()))
+ reportFatalInternalError("unable to write nop sequence of " +
+ Twine(Count) + " bytes");
+ } else {
+ // Otherwise, write out in multiples of the value size.
+ for (uint64_t i = 0; i != Count; ++i) {
+ switch (F.getAlignFillLen()) {
+ default:
+ llvm_unreachable("Invalid size!");
+ case 1:
+ OS << char(F.getAlignFill());
+ break;
+ case 2:
+ support::endian::write<uint16_t>(OS, F.getAlignFill(), Endian);
+ break;
+ case 4:
+ support::endian::write<uint32_t>(OS, F.getAlignFill(), Endian);
+ break;
+ case 8:
+ support::endian::write<uint64_t>(OS, F.getAlignFill(), Endian);
+ break;
+ }
}
}
- break;
- }
+ } break;
case MCFragment::FT_Fill: {
++stats::EmittedFillFragments;
@@ -583,42 +558,45 @@ void MCAssembler::writeSectionData(raw_ostream &OS,
const MCSection *Sec) const {
assert(getBackendPtr() && "Expected assembler backend");
- // Ignore virtual sections.
- if (Sec->isVirtualSection()) {
+ if (Sec->isBssSection()) {
assert(getSectionFileSize(*Sec) == 0 && "Invalid size for section!");
- // Check that contents are only things legal inside a virtual section.
+ // Ensure no fixups or non-zero bytes are written to BSS sections, catching
+ // errors in both input assembly code and MCStreamer API usage. Location is
+ // not tracked for efficiency.
+ auto Fn = [](char c) { return c != 0; };
for (const MCFragment &F : *Sec) {
+ bool HasNonZero = false;
switch (F.getKind()) {
- default: llvm_unreachable("Invalid fragment in virtual section!");
- case MCFragment::FT_Data: {
- // Check that we aren't trying to write a non-zero contents (or fixups)
- // into a virtual section. This is to support clients which use standard
- // directives to fill the contents of virtual sections.
- if (F.getFixups().size() || F.getVarFixups().size())
- reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" +
- Sec->getName() + "' cannot have fixups");
- for (char C : F.getContents())
- if (C) {
- reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" +
- Sec->getName() +
- "' cannot have non-zero initializers");
- break;
- }
+ default:
+ reportFatalInternalError("BSS section '" + Sec->getName() +
+ "' contains invalid fragment");
+ break;
+ case MCFragment::FT_Data:
+ case MCFragment::FT_Relaxable:
+ HasNonZero =
+ any_of(F.getContents(), Fn) || any_of(F.getVarContents(), Fn);
break;
- }
case MCFragment::FT_Align:
- // Check that we aren't trying to write a non-zero value into a virtual
- // section.
- assert((cast<MCAlignFragment>(F).getFillLen() == 0 ||
- cast<MCAlignFragment>(F).getFill() == 0) &&
- "Invalid align in virtual section!");
+ // Disallowed for API usage. AsmParser changes non-zero fill values to
+ // 0.
+ assert(F.getAlignFill() == 0 && "Invalid align in virtual section!");
break;
case MCFragment::FT_Fill:
- assert((cast<MCFillFragment>(F).getValue() == 0) &&
- "Invalid fill in virtual section!");
+ HasNonZero = cast<MCFillFragment>(F).getValue() != 0;
break;
case MCFragment::FT_Org:
+ HasNonZero = cast<MCOrgFragment>(F).getValue() != 0;
+ break;
+ }
+ if (HasNonZero) {
+ reportError(SMLoc(), "BSS section '" + Sec->getName() +
+ "' cannot have non-zero bytes");
+ break;
+ }
+ if (F.getFixups().size() || F.getVarFixups().size()) {
+ reportError(SMLoc(),
+ "BSS section '" + Sec->getName() + "' cannot have fixups");
break;
}
}
@@ -722,34 +700,25 @@ void MCAssembler::layout() {
for (MCSection &Sec : *this) {
for (MCFragment &F : Sec) {
// Process fragments with fixups here.
- if (F.isEncoded()) {
- auto Contents = F.getContents();
- for (MCFixup &Fixup : F.getFixups()) {
+ auto Contents = F.getContents();
+ for (MCFixup &Fixup : F.getFixups()) {
+ uint64_t FixedValue;
+ MCValue Target;
+ evaluateFixup(F, Fixup, Target, FixedValue,
+ /*RecordReloc=*/true, Contents);
+ }
+ if (F.getVarFixups().size()) {
+ // In the variable part, fixup offsets are relative to the fixed part's
+ // start. Extend the variable contents to the left to account for the
+ // fixed part size.
+ Contents = MutableArrayRef(F.getParent()->ContentStorage)
+ .slice(F.VarContentStart - Contents.size(), F.getSize());
+ for (MCFixup &Fixup : F.getVarFixups()) {
uint64_t FixedValue;
MCValue Target;
evaluateFixup(F, Fixup, Target, FixedValue,
/*RecordReloc=*/true, Contents);
}
- // In the variable part, fixup offsets are relative to the fixed part's
- // start. Extend the variable contents to the left to account for the
- // fixed part size.
- auto VarFixups = F.getVarFixups();
- if (VarFixups.size()) {
- Contents =
- MutableArrayRef(F.getParent()->ContentStorage)
- .slice(F.VarContentStart - Contents.size(), F.getSize());
- for (MCFixup &Fixup : VarFixups) {
- uint64_t FixedValue;
- MCValue Target;
- evaluateFixup(F, Fixup, Target, FixedValue,
- /*RecordReloc=*/true, Contents);
- }
- }
- } else if (auto *AF = dyn_cast<MCAlignFragment>(&F)) {
- // For RISC-V linker relaxation, an alignment relocation might be
- // needed.
- if (AF->hasEmitNops())
- getBackend().shouldInsertFixupForCodeAlign(*this, *AF);
}
}
}
@@ -953,15 +922,15 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCFragment &F) {
}
bool MCAssembler::relaxCVInlineLineTable(MCCVInlineLineTableFragment &F) {
- unsigned OldSize = F.getContents().size();
+ unsigned OldSize = F.getVarContents().size();
getContext().getCVContext().encodeInlineLineTable(*this, F);
- return OldSize != F.getContents().size();
+ return OldSize != F.getVarContents().size();
}
bool MCAssembler::relaxCVDefRange(MCCVDefRangeFragment &F) {
- unsigned OldSize = F.getContents().size();
+ unsigned OldSize = F.getVarContents().size();
getContext().getCVContext().encodeDefRange(*this, F);
- return OldSize != F.getContents().size();
+ return OldSize != F.getVarContents().size();
}
bool MCAssembler::relaxFill(MCFillFragment &F) {
@@ -1000,7 +969,32 @@ void MCAssembler::layoutSection(MCSection &Sec) {
uint64_t Offset = 0;
for (MCFragment &F : Sec) {
F.Offset = Offset;
- Offset += computeFragmentSize(F);
+ if (F.getKind() == MCFragment::FT_Align) {
+ Offset += F.getFixedSize();
+ unsigned Size = offsetToAlignment(Offset, F.getAlignment());
+ // In the nops mode, RISC-V style linker relaxation might adjust the size
+ // and add a fixup, even if `Size` is originally 0.
+ bool AlignFixup = false;
+ if (F.hasAlignEmitNops()) {
+ AlignFixup = getBackend().relaxAlign(F, Size);
+ // If the backend does not handle the fragment specially, pad with nops,
+ // but ensure that the padding is larger than the minimum nop size.
+ if (!AlignFixup)
+ while (Size % getBackend().getMinimumNopSize())
+ Size += F.getAlignment().value();
+ }
+ if (!AlignFixup && Size > F.getAlignMaxBytesToEmit())
+ Size = 0;
+ // Update the variable tail size, offset by FixedSize to prevent ubsan
+ // pointer-overflow in evaluateFixup. The content is ignored.
+ F.VarContentStart = F.getFixedSize();
+ F.VarContentEnd = F.VarContentStart + Size;
+ if (F.VarContentEnd > F.getParent()->ContentStorage.size())
+ F.getParent()->ContentStorage.resize(F.VarContentEnd);
+ Offset += Size;
+ } else {
+ Offset += computeFragmentSize(F);
+ }
}
}
diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp
index 1f98251..7d528a5 100644
--- a/llvm/lib/MC/MCCodeView.cpp
+++ b/llvm/lib/MC/MCCodeView.cpp
@@ -26,8 +26,10 @@ using namespace llvm;
using namespace llvm::codeview;
void CodeViewContext::finish() {
- if (StrTabFragment)
- StrTabFragment->setContents(StrTab);
+ if (!StrTabFragment)
+ return;
+ assert(StrTabFragment->getKind() == MCFragment::FT_Data);
+ StrTabFragment->setVarContents(StrTab);
}
/// This is a valid number for use with .cv_loc if we've already seen a .cv_file
@@ -166,8 +168,9 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
// somewhere else. If somebody wants two string tables in their .s file, one
// will just be empty.
if (!StrTabFragment) {
- StrTabFragment = Ctx.allocFragment<MCFragment>();
- OS.insert(StrTabFragment);
+ OS.newFragment();
+ StrTabFragment = OS.getCurrentFragment();
+ OS.newFragment();
}
OS.emitValueToAlignment(Align(4), 0);
@@ -603,7 +606,7 @@ void CodeViewContext::encodeInlineLineTable(const MCAssembler &Asm,
compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer);
- Frag.setContents(Buffer);
+ Frag.setVarContents(Buffer);
}
void CodeViewContext::encodeDefRange(const MCAssembler &Asm,
@@ -691,6 +694,6 @@ void CodeViewContext::encodeDefRange(const MCAssembler &Asm,
}
}
- Frag.setContents(Contents);
- Frag.setFixups(Fixups);
+ Frag.setVarContents(Contents);
+ Frag.setVarFixups(Fixups);
}
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index b1dced7..e7c0d37 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -447,10 +447,17 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
StringRef(reinterpret_cast<const char *>(Cksum.data()), Cksum.size()));
}
if (HasAnySource) {
+ // From https://dwarfstd.org/issues/180201.1.html
+ // * The value is an empty null-terminated string if no source is available
+ StringRef Source = DwarfFile.Source.value_or(StringRef());
+ // * If the source is available but is an empty file then the value is a
+ // null-terminated single "\n".
+ if (DwarfFile.Source && DwarfFile.Source->empty())
+ Source = "\n";
if (LineStr)
- LineStr->emitRef(MCOS, DwarfFile.Source.value_or(StringRef()));
+ LineStr->emitRef(MCOS, Source);
else {
- MCOS->emitBytes(DwarfFile.Source.value_or(StringRef())); // Source and...
+ MCOS->emitBytes(Source); // Source and...
MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
}
}
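
The convention from DWARF issue 180201.1 encoded above distinguishes two cases that both used to serialize as an empty string: no source available versus a source file that is genuinely empty. The rule in isolation, as a sketch:

#include "llvm/ADT/StringRef.h"
#include <optional>

using namespace llvm;

// DWARF issue 180201.1: absent source is an empty string, while a present
// but empty source file is encoded as a single "\n".
static StringRef encodeSourceField(const std::optional<StringRef> &Source) {
  if (!Source)
    return "";
  if (Source->empty())
    return "\n";
  return *Source;
}
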
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 49071bd..b8cbaea5 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -88,7 +88,7 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
if (SectionELF->getFlags() & ELF::SHF_GNU_RETAIN)
getWriter().markGnuAbi();
- changeSectionImpl(Section, Subsection);
+ MCObjectStreamer::changeSection(Section, Subsection);
Asm.registerSymbol(*Section->getBeginSymbol());
}
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 22dff49..dbb2fd1 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -370,7 +370,6 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
}
int64_t Num;
- unsigned Count;
if (DF) {
Displacement += DF->getContents().size();
} else if (F->getKind() == MCFragment::FT_Relaxable &&
@@ -379,11 +378,9 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
// After layout, during relocation generation, it can be treated as a
// data fragment.
Displacement += F->getSize();
- } else if (auto *AF = dyn_cast<MCAlignFragment>(F);
- AF && Layout && AF->hasEmitNops() &&
- !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign(
- *AF, Count)) {
- Displacement += Asm->computeFragmentSize(*AF);
+ } else if (F->getKind() == MCFragment::FT_Align && Layout &&
+ F->isLinkerRelaxable()) {
+ Displacement += Asm->computeFragmentSize(*F);
} else if (auto *FF = dyn_cast<MCFillFragment>(F);
FF && FF->getNumValues().evaluateAsAbsolute(Num)) {
Displacement += Num * FF->getValueSize();
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index fe7afd4..3c395e5 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -72,17 +72,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
};
switch (getKind()) {
- case MCFragment::FT_Align: {
- const auto *AF = cast<MCAlignFragment>(this);
- OS << " Align:" << AF->getAlignment().value() << " Fill:" << AF->getFill()
- << " FillLen:" << unsigned(AF->getFillLen())
- << " MaxBytesToEmit:" << AF->getMaxBytesToEmit();
- if (AF->hasEmitNops())
- OS << " Nops";
- break;
- }
case MCFragment::FT_Data:
case MCFragment::FT_Relaxable:
+ case MCFragment::FT_Align:
case MCFragment::FT_LEB:
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame: {
@@ -91,8 +83,13 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
auto Fixed = getContents();
auto Var = getVarContents();
OS << " Size:" << Fixed.size();
- if (getKind() != MCFragment::FT_Data)
+ if (getKind() != MCFragment::FT_Data) {
OS << '+' << Var.size();
+ // FT_Align uses getVarContents to track the size, but the content is
+ // ignored and not useful.
+ if (getKind() == MCFragment::FT_Align)
+ Var = {};
+ }
OS << " [";
for (unsigned i = 0, e = Fixed.size(); i != e; ++i) {
if (i) OS << ",";
@@ -111,6 +108,13 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
OS << ' ';
getInst().dump_pretty(OS);
break;
+ case MCFragment::FT_Align:
+ OS << "\n Align:" << getAlignment().value() << " Fill:" << getAlignFill()
+ << " FillLen:" << unsigned(getAlignFillLen())
+ << " MaxBytesToEmit:" << getAlignMaxBytesToEmit();
+ if (hasAlignEmitNops())
+ OS << " Nops";
+ break;
case MCFragment::FT_LEB: {
OS << " Value:";
getLEBValue().print(OS, nullptr);
diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp
index b702191..1718e2a 100644
--- a/llvm/lib/MC/MCGOFFStreamer.cpp
+++ b/llvm/lib/MC/MCGOFFStreamer.cpp
@@ -26,19 +26,15 @@ GOFFObjectWriter &MCGOFFStreamer::getWriter() {
return static_cast<GOFFObjectWriter &>(getAssembler().getWriter());
}
-// Make sure that all section are registered in the correct order.
-static void registerSectionHierarchy(MCAssembler &Asm, MCSectionGOFF *Section) {
- if (Section->isRegistered())
- return;
- if (Section->getParent())
- registerSectionHierarchy(Asm, Section->getParent());
- Asm.registerSection(*Section);
-}
-
void MCGOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- registerSectionHierarchy(getAssembler(),
- static_cast<MCSectionGOFF *>(Section));
- MCObjectStreamer::changeSection(Section, Subsection);
+ // Make sure that all sections are registered in the correct order.
+ SmallVector<MCSectionGOFF *> Sections;
+ for (auto *S = static_cast<MCSectionGOFF *>(Section); S; S = S->getParent())
+ Sections.push_back(S);
+ while (!Sections.empty()) {
+ auto *S = Sections.pop_back_val();
+ MCObjectStreamer::changeSection(S, Sections.empty() ? Subsection : 0);
+ }
}
MCStreamer *llvm::createGOFFStreamer(MCContext &Context,
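
The rewritten changeSection above replaces recursion with an explicit child-to-parent chain that is then drained parent-first, so ancestors always change section before descendants. The traversal shape in isolation, with a hypothetical stand-in Node type:

#include "llvm/ADT/SmallVector.h"

// Hypothetical stand-in for a section type with a parent link.
struct Node {
  Node *Parent = nullptr;
};

// Collect the chain child -> parent, then visit it parent-first, mirroring
// the explicit stack in MCGOFFStreamer::changeSection above.
template <typename Fn> void visitParentFirst(Node *N, Fn Visit) {
  llvm::SmallVector<Node *, 8> Chain;
  for (; N; N = N->Parent)
    Chain.push_back(N);
  while (!Chain.empty())
    Visit(Chain.pop_back_val());
}
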
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 08d2b93..8c3332c 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MachO.h"
@@ -132,8 +131,7 @@ public:
} // end anonymous namespace.
void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- // Change the section normally.
- changeSectionImpl(Section, Subsection);
+ MCObjectStreamer::changeSection(Section, Subsection);
// Output a linker-local symbol so we don't need section-relative local
// relocations. The linker hates us when we do that.
@@ -393,7 +391,7 @@ void MCMachOStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
// On Darwin all virtual sections have zerofill type. Disallow the usage of
// .zerofill in non-virtual sections. If something similar is needed, use
// .space or .zero.
- if (!Section->isVirtualSection()) {
+ if (!Section->isBssSection()) {
getContext().reportError(
Loc, "The usage of .zerofill is restricted to sections of "
"ZEROFILL type. Use .zero or .space instead.");
@@ -479,7 +477,8 @@ void MCMachOStreamer::finalizeCGProfile() {
// and set its size now so that it's accounted for in layout.
MCSection *CGProfileSection = Asm.getContext().getMachOSection(
"__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
- changeSection(CGProfileSection);
+ // Call the base class changeSection to omit the linker-local label.
+ MCObjectStreamer::changeSection(CGProfileSection);
// For each entry, reserve space for 2 32-bit indices and a 64-bit count.
size_t SectionBytes =
W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
@@ -511,7 +510,8 @@ void MCMachOStreamer::createAddrSigSection() {
// to be computed immediately after in order for it to be exported correctly.
MCSection *AddrSigSection =
Asm.getContext().getObjectFileInfo()->getAddrSigSection();
- changeSection(AddrSigSection);
+ // Call the base class changeSection to omit the linker-local label.
+ MCObjectStreamer::changeSection(AddrSigSection);
auto *Frag = cast<MCFragment>(AddrSigSection->curFragList()->Head);
// We will generate a series of pointer-sized symbol relocations at offset
// 0x0. Set the section size to be large enough to contain a single pointer
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index f61dda6..fcd5cbf 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -19,7 +19,6 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
using namespace llvm;
@@ -33,6 +32,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
Context, std::move(TAB), std::move(Emitter), std::move(OW))),
EmitEHFrame(true), EmitDebugFrame(false) {
assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr());
+ IsObj = true;
setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
if (Context.getTargetOptions() && Context.getTargetOptions()->MCRelaxAll)
Assembler->setRelaxAll(true);
@@ -46,6 +46,25 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
return nullptr;
}
+void MCObjectStreamer::newFragment() {
+ addFragment(getContext().allocFragment<MCFragment>());
+}
+
+void MCObjectStreamer::insert(MCFragment *F) {
+ assert(F->getKind() != MCFragment::FT_Data &&
+ "F should have a variable-size tail");
+ addFragment(F);
+ newFragment();
+}
+
+void MCObjectStreamer::appendContents(size_t Num, char Elt) {
+ CurFrag->appendContents(Num, Elt);
+}
+
+void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
+ CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+}
+
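These two helpers centralize a pattern the later MCWin64EH and
MCWinCOFFStreamer hunks switch to: record a fixup at the current fragment's
fixed-size offset, then append placeholder bytes for the object writer to
patch. A simplified standalone model (toy types, not the in-tree signatures):

    #include <cstddef>
    #include <utility>
    #include <vector>
    struct Fragment {
      std::vector<char> Contents;
      std::vector<std::pair<std::size_t, int>> Fixups; // (offset, kind)
    };
    void addFixup(Fragment &F, int Kind) {
      F.Fixups.push_back({F.Contents.size(), Kind}); // offset = bytes so far
    }
    void appendContents(Fragment &F, std::size_t Num, char Elt) {
      F.Contents.insert(F.Contents.end(), Num, Elt); // placeholder bytes
    }
    // A 4-byte relocated field: addFixup(F, Kind); appendContents(F, 4, 0);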
// As a compile-time optimization, avoid allocating and evaluating an MCExpr
// tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment's fixed
// part.
@@ -106,32 +125,21 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
MCDwarfFrameEmitter::Emit(*this, MAB, false);
}
-MCFragment *MCObjectStreamer::getOrCreateDataFragment() {
- // TODO: Start a new fragment whenever finalizing the variable-size tail of a
- // previous one, so that all getOrCreateDataFragment calls can be replaced
- // with getCurrentFragment
- auto *F = getCurrentFragment();
- if (F->getKind() != MCFragment::FT_Data) {
- F = getContext().allocFragment<MCFragment>();
- insert(F);
- }
- return F;
-}
-
void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
Assembler->registerSymbol(Sym);
}
-void MCObjectStreamer::emitCFISections(bool EH, bool Debug) {
- MCStreamer::emitCFISections(EH, Debug);
+void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
+ MCStreamer::emitCFISections(EH, Debug, SFrame);
EmitEHFrame = EH;
EmitDebugFrame = Debug;
+ EmitSFrame = SFrame;
}
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
MCStreamer::emitValueImpl(Value, Size, Loc);
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -180,7 +188,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// If there is a current fragment, mark the symbol as pointing into it.
// Otherwise queue the label and set its fragment pointer when we emit the
// next fragment.
- MCFragment *F = getOrCreateDataFragment();
+ MCFragment *F = getCurrentFragment();
Symbol->setFragment(F);
Symbol->setOffset(F->getContents().size());
@@ -214,7 +222,7 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
emitULEB128IntValue(IntValue);
return;
}
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
F->makeLEB(false, Value);
newFragment();
}
@@ -225,7 +233,7 @@ void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) {
emitSLEB128IntValue(IntValue);
return;
}
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
F->makeLEB(true, Value);
newFragment();
}
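Both LEB emitters now follow the same idiom: turn the current fragment's tail
into variable-size LEB data and immediately start a fresh fragment, so later
fixed-size appends never land behind the relaxable tail. A toy sketch of that
invariant (hypothetical types, not the MC API):

    #include <string>
    #include <vector>
    struct Frag { std::string Fixed, VarTail; };
    std::vector<Frag> Frags(1);
    void makeLEB(long V) { Frags.back().VarTail = "uleb128:" + std::to_string(V); }
    void newFragment() { Frags.emplace_back(); }
    void emitULEB(long V) { makeLEB(V); newFragment(); } // seal the tail
    void emitBytes(const std::string &B) { Frags.back().Fixed += B; }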
@@ -236,11 +244,6 @@ void MCObjectStreamer::emitWeakReference(MCSymbol *Alias,
}
void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- changeSectionImpl(Section, Subsection);
-}
-
-bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
- uint32_t Subsection) {
assert(Section && "Cannot switch to a null section!");
getContext().clearDwarfLocSeen();
@@ -259,7 +262,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
Section->CurFragList = &Subsections[I].second;
CurFrag = Section->CurFragList->Tail;
- return getAssembler().registerSection(*Section);
+ getAssembler().registerSection(*Section);
}
void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) {
@@ -291,18 +294,6 @@ bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const {
void MCObjectStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- const MCSection &Sec = *getCurrentSectionOnly();
- if (Sec.isVirtualSection()) {
- getContext().reportError(Inst.getLoc(), Twine(Sec.getVirtualSectionKind()) +
- " section '" + Sec.getName() +
- "' cannot have instructions");
- return;
- }
- emitInstructionImpl(Inst, STI);
-}
-
-void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
MCStreamer::emitInstruction(Inst, STI);
MCSection *Sec = getCurrentSectionOnly();
@@ -336,7 +327,7 @@ void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst,
void MCObjectStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- MCFragment *F = getOrCreateDataFragment();
+ MCFragment *F = getCurrentFragment();
// Append the instruction to the data fragment.
size_t FixupStartIndex = F->getFixups().size();
@@ -368,7 +359,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
SmallVector<char, 16> Data;
SmallVector<MCFixup, 1> Fixups;
getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI);
@@ -379,6 +370,7 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
F->setVarContents(Data);
F->setVarFixups(Fixups);
F->setInst(Inst);
+ newFragment();
}
void MCObjectStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
@@ -440,10 +432,11 @@ void MCObjectStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta,
return;
}
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
F->Kind = MCFragment::FT_Dwarf;
F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, SMLoc()));
F->setDwarfLineDelta(LineDelta);
+ newFragment();
}
void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section,
@@ -471,9 +464,10 @@ void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section,
void MCObjectStreamer::emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label,
SMLoc Loc) {
- auto *F = getOrCreateDataFragment();
+ auto *F = getCurrentFragment();
F->Kind = MCFragment::FT_DwarfFrame;
F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, Loc));
+ newFragment();
}
void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
@@ -532,7 +526,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
DF->appendContents(ArrayRef(Data.data(), Data.size()));
}
@@ -541,28 +535,21 @@ void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill,
unsigned MaxBytesToEmit) {
if (MaxBytesToEmit == 0)
MaxBytesToEmit = Alignment.value();
- insert(getContext().allocFragment<MCAlignFragment>(Alignment, Fill, FillLen,
- MaxBytesToEmit));
+ MCFragment *F = getCurrentFragment();
+ F->makeAlign(Alignment, Fill, FillLen, MaxBytesToEmit);
+ newFragment();
// Update the maximum alignment on the current section if necessary.
- MCSection *CurSec = getCurrentSectionOnly();
- CurSec->ensureMinAlignment(Alignment);
+ F->getParent()->ensureMinAlignment(Alignment);
}
void MCObjectStreamer::emitCodeAlignment(Align Alignment,
const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit) {
+ auto *F = getCurrentFragment();
emitValueToAlignment(Alignment, 0, 1, MaxBytesToEmit);
- auto *F = cast<MCAlignFragment>(getCurrentFragment());
- F->setEmitNops(true, STI);
- // With RISC-V style linker relaxation, mark the section as linker-relaxable
- // if the alignment is larger than the minimum NOP size.
- unsigned Size;
- if (getAssembler().getBackend().shouldInsertExtraNopBytesForCodeAlign(*F,
- Size)) {
- getCurrentSectionOnly()->setLinkerRelaxable();
- newFragment();
- }
+ F->u.align.EmitNops = true;
+ F->STI = STI;
}
void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 77bf843..eda5e8c 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3404,11 +3404,10 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
- if (HasFillExpr && FillExpr != 0 && Section->isVirtualSection()) {
+ if (HasFillExpr && FillExpr != 0 && Section->isBssSection()) {
ReturnVal |=
- Warning(FillExprLoc, "ignoring non-zero fill value in " +
- Section->getVirtualSectionKind() +
- " section '" + Section->getName() + "'");
+ Warning(FillExprLoc, "ignoring non-zero fill value in BSS section '" +
+ Section->getName() + "'");
FillExpr = 0;
}
@@ -4094,27 +4093,30 @@ bool AsmParser::parseDirectiveCVFPOData() {
}
/// parseDirectiveCFISections
-/// ::= .cfi_sections section [, section]
+/// ::= .cfi_sections section [, section][, section]
bool AsmParser::parseDirectiveCFISections() {
StringRef Name;
bool EH = false;
bool Debug = false;
+ bool SFrame = false;
if (!parseOptionalToken(AsmToken::EndOfStatement)) {
for (;;) {
if (parseIdentifier(Name))
- return TokError("expected .eh_frame or .debug_frame");
+ return TokError("expected .eh_frame, .debug_frame, or .sframe");
if (Name == ".eh_frame")
EH = true;
else if (Name == ".debug_frame")
Debug = true;
+ else if (Name == ".sframe")
+ SFrame = true;
if (parseOptionalToken(AsmToken::EndOfStatement))
break;
if (parseComma())
return true;
}
}
- getStreamer().emitCFISections(EH, Debug);
+ getStreamer().emitCFISections(EH, Debug, SFrame);
return false;
}
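With the extra alternative, a hand-written module can request SFrame data
alongside the usual frame sections, for example:

    .cfi_sections .eh_frame, .sframe

Note that, as before, names matching none of the alternatives are silently
ignored; only a malformed identifier produces the (now extended) diagnostic.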
diff --git a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
index 7f09349..d7b0546 100644
--- a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -8,8 +8,8 @@
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegister.h"
-#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@@ -25,8 +25,9 @@ MCSubtargetInfo &MCTargetAsmParser::copySTI() {
STI = &STICopy;
// The returned STI will likely be modified. Create a new fragment to prevent
// mixing STI values within a fragment.
- if (getStreamer().getCurrentFragment())
- getStreamer().newFragment();
+ auto &S = getStreamer();
+ if (S.isObj() && S.getCurrentFragment())
+ static_cast<MCObjectStreamer &>(S).newFragment();
return STICopy;
}
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 9367145..023f7f2 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -18,10 +18,10 @@
using namespace llvm;
-MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText,
- bool IsVirtual, MCSymbol *Begin)
+MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss,
+ MCSymbol *Begin)
: Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText),
- IsVirtual(IsVirtual), LinkerRelaxable(false), Name(Name), Variant(V) {
+ IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) {
// The initial subsection number is 0. Create a fragment list.
CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
}
@@ -34,8 +34,6 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
bool MCSection::hasEnded() const { return End && End->isInSection(); }
-StringRef MCSection::getVirtualSectionKind() const { return "virtual"; }
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCSection::dump(
DenseMap<const MCFragment *, SmallVector<const MCSymbol *, 0>> *FragToSyms)
@@ -60,16 +58,6 @@ LLVM_DUMP_METHOD void MCSection::dump(
}
#endif
-void MCFragment::setContents(ArrayRef<char> Contents) {
- auto &S = getParent()->ContentStorage;
- if (ContentStart + Contents.size() > ContentEnd) {
- ContentStart = S.size();
- S.resize_for_overwrite(S.size() + Contents.size());
- }
- ContentEnd = ContentStart + Contents.size();
- llvm::copy(Contents, S.begin() + ContentStart);
-}
-
void MCFragment::setVarContents(ArrayRef<char> Contents) {
auto &S = getParent()->ContentStorage;
if (VarContentStart + Contents.size() > VarContentEnd) {
@@ -96,16 +84,6 @@ void MCFragment::appendFixups(ArrayRef<MCFixup> Fixups) {
FixupEnd = S.size();
}
-void MCFragment::setFixups(ArrayRef<MCFixup> Fixups) {
- auto &S = getParent()->FixupStorage;
- if (FixupStart + Fixups.size() > FixupEnd) {
- FixupStart = S.size();
- S.resize_for_overwrite(S.size() + Fixups.size());
- }
- FixupEnd = FixupStart + Fixups.size();
- llvm::copy(Fixups, S.begin() + FixupStart);
-}
-
void MCFragment::setVarFixups(ArrayRef<MCFixup> Fixups) {
auto &S = getParent()->FixupStorage;
if (VarFixupStart + Fixups.size() > VarFixupEnd) {
diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp
index 94e29ce..5bf1473 100644
--- a/llvm/lib/MC/MCSectionCOFF.cpp
+++ b/llvm/lib/MC/MCSectionCOFF.cpp
@@ -115,7 +115,3 @@ void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
bool MCSectionCOFF::useCodeAlign() const { return isText(); }
-
-StringRef MCSectionCOFF::getVirtualSectionKind() const {
- return "IMAGE_SCN_CNT_UNINITIALIZED_DATA";
-}
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
index 299fe40..ef33f9c 100644
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ b/llvm/lib/MC/MCSectionELF.cpp
@@ -215,5 +215,3 @@ void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
bool MCSectionELF::useCodeAlign() const {
return getFlags() & ELF::SHF_EXECINSTR;
}
-
-StringRef MCSectionELF::getVirtualSectionKind() const { return "SHT_NOBITS"; }
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index c3ecf8f..e14a32f 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -415,7 +415,7 @@ void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
void MCStreamer::emitConditionalAssignment(MCSymbol *Symbol,
const MCExpr *Value) {}
-void MCStreamer::emitCFISections(bool EH, bool Debug) {}
+void MCStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {}
void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) {
if (!FrameInfoStack.empty() &&
@@ -1404,7 +1404,7 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) {
return Sym;
}
-void MCStreamer::insert(MCFragment *F) {
+void MCStreamer::addFragment(MCFragment *F) {
auto *Sec = CurFrag->getParent();
F->setParent(Sec);
F->setLayoutOrder(CurFrag->getLayoutOrder() + 1);
@@ -1413,10 +1413,6 @@ void MCStreamer::insert(MCFragment *F) {
Sec->curFragList()->Tail = F;
}
-void MCStreamer::newFragment() {
- insert(getContext().allocFragment<MCFragment>());
-}
-
static VersionTuple
targetVersionOrMinimumSupportedOSVersion(const Triple &Target,
VersionTuple TargetVersion) {
diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp
index bff4b8d..be6d19d 100644
--- a/llvm/lib/MC/MCTargetOptions.cpp
+++ b/llvm/lib/MC/MCTargetOptions.cpp
@@ -19,7 +19,8 @@ MCTargetOptions::MCTargetOptions()
PreserveAsmComments(true), Dwarf64(false),
EmitDwarfUnwind(EmitDwarfUnwindType::Default),
MCUseDwarfDirectory(DefaultDwarfDirectory),
- EmitCompactUnwindNonCanonical(false), PPCUseFullRegisterNames(false) {}
+ EmitCompactUnwindNonCanonical(false), EmitSFrameUnwind(false),
+ PPCUseFullRegisterNames(false) {}
StringRef MCTargetOptions::getABIName() const {
return ABIName;
diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
index 2adc291..ff95ff7 100644
--- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
+++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
@@ -41,6 +41,7 @@ MCOPT(int, DwarfVersion)
MCOPT(bool, Dwarf64)
MCOPT(EmitDwarfUnwindType, EmitDwarfUnwind)
MCOPT(bool, EmitCompactUnwindNonCanonical)
+MCOPT(bool, EmitSFrameUnwind)
MCOPT(bool, ShowMCInst)
MCOPT(bool, FatalWarnings)
MCOPT(bool, NoWarn)
@@ -105,6 +106,11 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() {
false)); // By default, use DWARF for non-canonical personalities.
MCBINDOPT(EmitCompactUnwindNonCanonical);
+ static cl::opt<bool> EmitSFrameUnwind(
+ "gsframe", cl::desc("Whether to emit .sframe unwind sections."),
+ cl::init(false));
+ MCBINDOPT(EmitSFrameUnwind);
+
static cl::opt<bool> ShowMCInst(
"asm-show-inst",
cl::desc("Emit internal instruction representation to assembly file"));
@@ -188,6 +194,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() {
Options.X86Sse2Avx = getX86Sse2Avx();
Options.EmitDwarfUnwind = getEmitDwarfUnwind();
Options.EmitCompactUnwindNonCanonical = getEmitCompactUnwindNonCanonical();
+ Options.EmitSFrameUnwind = getEmitSFrameUnwind();
Options.AsSecureLogFile = getAsSecureLogFile();
return Options;
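Any tool that binds these MC flags (llvm-mc, for instance) should therefore
accept the new option; a hypothetical invocation, assuming a target whose
backend honors EmitSFrameUnwind:

    llvm-mc -filetype=obj --gsframe input.s -o input.o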
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index e8b26bf..72a8dd7 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,15 +318,13 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// Emit the epilog instructions.
if (EnableUnwindV2) {
- MCFragment *DF = OS->getOrCreateDataFragment();
-
bool IsLast = true;
for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
if (IsLast) {
IsLast = false;
uint8_t Flags = LastEpilogIsAtEnd ? 0x01 : 0;
- streamer.emitInt8(EpilogSize);
- streamer.emitInt8((Flags << 4) | Win64EH::UOP_Epilog);
+ OS->emitInt8(EpilogSize);
+ OS->emitInt8((Flags << 4) | Win64EH::UOP_Epilog);
if (LastEpilogIsAtEnd)
continue;
@@ -337,9 +335,8 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// layout has been completed.
auto *MCE = MCUnwindV2EpilogTargetExpr::create(*info, Epilog.second,
EpilogSize, context);
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_2);
- DF->addFixup(Fixup);
- DF->appendContents(2, 0);
+ OS->addFixup(MCE, FK_Data_2);
+ OS->appendContents(2, 0);
}
}
if (AddPaddingEpilogCode)
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 3398775..9369bea 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -153,7 +153,7 @@ void MCWinCOFFStreamer::initSections(bool NoExecStack,
}
void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
- changeSectionImpl(Section, Subsection);
+ MCObjectStreamer::changeSection(Section, Subsection);
// Ensure that the first and the second symbols relative to the section are
// the section symbol and the COMDAT symbol.
getAssembler().registerSymbol(*Section->getBeginSymbol());
@@ -278,35 +278,28 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, FK_SecRel_2);
- DF->addFixup(Fixup);
- DF->appendContents(2, 0);
+ addFixup(SRE, FK_SecRel_2);
+ appendContents(2, 0);
}
void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
uint64_t Offset) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol A for the relocation relative reference.
const MCExpr *MCE = MCSymbolRefExpr::create(Symbol, getContext());
// Add the constant offset, if given.
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
- // Build the secrel32 relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_SecRel_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_SecRel_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
int64_t Offset) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol A for the relocation relative reference.
const MCExpr *MCE = MCSymbolRefExpr::create(
Symbol, MCSymbolRefExpr::VK_COFF_IMGREL32, getContext());
@@ -314,40 +307,29 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
- // Build the imgrel relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol for section number.
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
*Symbol, this->getWriter(), getContext());
- // Build the relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
visitUsedSymbol(*Symbol);
- MCFragment *DF = getOrCreateDataFragment();
// Create Symbol for section offset.
const MCExpr *MCE =
MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
- // Build the relocation.
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4);
- // Record the relocation.
- DF->addFixup(Fixup);
+ addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
- DF->appendContents(4, 0);
+ appendContents(4, 0);
}
void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 4d45296..63381b4 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -89,7 +89,7 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
// Add a Fixup here to later record a relocation of type R_REF to prevent the
// ref symbol from being garbage collected (by the binder).
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
std::optional<MCFixupKind> MaybeKind =
getAssembler().getBackend().getFixupKind("R_REF");
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 3291dd7..48d2fc6 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -131,7 +131,7 @@ uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm,
return 0;
const MCSection &NextSec = *SectionOrder[Next];
- if (NextSec.isVirtualSection())
+ if (NextSec.isBssSection())
return 0;
return offsetToAlignment(EndAddr, NextSec.getAlign());
}
@@ -267,7 +267,7 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm,
const MCSectionMachO &Section = cast<MCSectionMachO>(Sec);
// The offset is unused for virtual sections.
- if (Section.isVirtualSection()) {
+ if (Section.isBssSection()) {
assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!");
FileOffset = 0;
}
@@ -682,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) {
unsigned i = 0;
// Compute the section layout order. Virtual sections must go last.
for (MCSection &Sec : Asm) {
- if (!Sec.isVirtualSection()) {
+ if (!Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
}
}
for (MCSection &Sec : Asm) {
- if (Sec.isVirtualSection()) {
+ if (Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
}
@@ -797,11 +797,8 @@ uint64_t MachObjectWriter::writeObject() {
UndefinedSymbolData);
if (!CGProfile.empty()) {
- MCSection *CGProfileSection = getContext().getMachOSection(
- "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
- auto &Frag = *CGProfileSection->begin();
- Frag.clearContents();
- raw_svector_ostream OS(Frag.getContentsForAppending());
+ SmallString<0> Content;
+ raw_svector_ostream OS(Content);
for (const MCObjectWriter::CGProfileEntry &CGPE : CGProfile) {
uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
@@ -809,7 +806,9 @@ uint64_t MachObjectWriter::writeObject() {
support::endian::write(OS, ToIndex, W.Endian);
support::endian::write(OS, CGPE.Count, W.Endian);
}
- Frag.doneAppending();
+ MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0,
+ SectionKind::getMetadata());
+ llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data());
}
unsigned NumSections = Asm.end() - Asm.begin();
@@ -883,7 +882,7 @@ uint64_t MachObjectWriter::writeObject() {
VMSize = std::max(VMSize, Address + Size);
- if (Sec.isVirtualSection())
+ if (Sec.isBssSection())
continue;
SectionDataSize = std::max(SectionDataSize, Address + Size);
@@ -915,7 +914,7 @@ uint64_t MachObjectWriter::writeObject() {
unsigned Flags = Sec.getTypeAndAttributes();
if (Sec.hasInstructions())
Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS;
- if (!cast<MCSectionMachO>(Sec).isVirtualSection() &&
+ if (!cast<MCSectionMachO>(Sec).isBssSection() &&
!isUInt<32>(SectionStart)) {
getContext().reportError(
SMLoc(), "cannot encode offset of section; object file too large");
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index da6dbf3..3b99af4 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -696,14 +696,15 @@ static void addData(SmallVectorImpl<char> &DataBytes,
if (Frag.hasInstructions())
report_fatal_error("only data supported in data sections");
- if (auto *Align = dyn_cast<MCAlignFragment>(&Frag)) {
- if (Align->getFillLen() != 1)
+ llvm::append_range(DataBytes, Frag.getContents());
+ if (Frag.getKind() == MCFragment::FT_Align) {
+ if (Frag.getAlignFillLen() != 1)
report_fatal_error("only byte values supported for alignment");
// If nops are requested, use zeros, as this is the data section.
- uint8_t Value = Align->hasEmitNops() ? 0 : Align->getFill();
+ uint8_t Value = Frag.hasAlignEmitNops() ? 0 : Frag.getAlignFill();
uint64_t Size =
- std::min<uint64_t>(alignTo(DataBytes.size(), Align->getAlignment()),
- DataBytes.size() + Align->getMaxBytesToEmit());
+ std::min<uint64_t>(alignTo(DataBytes.size(), Frag.getAlignment()),
+ DataBytes.size() + Frag.getAlignMaxBytesToEmit());
DataBytes.resize(Size, Value);
} else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) {
int64_t NumValues;
@@ -711,12 +712,10 @@ static void addData(SmallVectorImpl<char> &DataBytes,
llvm_unreachable("The fill should be an assembler constant");
DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
Fill->getValue());
+ } else if (Frag.getKind() == MCFragment::FT_LEB) {
+ llvm::append_range(DataBytes, Frag.getVarContents());
} else {
- llvm::append_range(DataBytes, Frag.getContents());
- if (Frag.getKind() == MCFragment::FT_LEB)
- llvm::append_range(DataBytes, Frag.getVarContents());
- else
- assert(Frag.getKind() == MCFragment::FT_Data);
+ assert(Frag.getKind() == MCFragment::FT_Data);
}
}
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index ee4d957..6ad4334 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -179,7 +179,7 @@ private:
void SetSymbolName(COFFSymbol &S);
void SetSectionName(COFFSection &S);
- bool IsPhysicalSection(COFFSection *S);
+ bool isUninitializedData(const COFFSection &S);
// Entity writing methods.
void WriteFileHeader(const COFF::header &Header);
@@ -453,8 +453,8 @@ void WinCOFFWriter::SetSymbolName(COFFSymbol &S) {
std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size());
}
-bool WinCOFFWriter::IsPhysicalSection(COFFSection *S) {
- return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) ==
+bool WinCOFFWriter::isUninitializedData(const COFFSection &S) {
+ return (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) !=
0;
}
@@ -606,6 +606,9 @@ void WinCOFFWriter::writeSection(const COFFSection &Sec) {
assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
AuxSymbol &SecDef = AuxSyms[0];
SecDef.Aux.SectionDefinition.CheckSum = CRC;
+ } else if (isUninitializedData(Sec)) {
+ // Error if fixups or non-zero bytes are present.
+ writeSectionContents(*Sec.MCSection);
}
// Write relocations for this section.
@@ -745,7 +748,7 @@ void WinCOFFWriter::assignFileOffsets() {
Sec->Header.SizeOfRawData = Asm->getSectionAddressSize(Section);
- if (IsPhysicalSection(Sec)) {
+ if (!isUninitializedData(*Sec)) {
Sec->Header.PointerToRawData = Offset;
Offset += Sec->Header.SizeOfRawData;
}
@@ -1067,10 +1070,8 @@ uint64_t WinCOFFWriter::writeObject() {
// Create the contents of the .llvm_addrsig section.
if (Mode != DwoOnly && OWriter.getEmitAddrsigSection()) {
- auto *Sec = getContext().getCOFFSection(".llvm_addrsig",
- COFF::IMAGE_SCN_LNK_REMOVE);
- auto *Frag = Sec->curFragList()->Head;
- raw_svector_ostream OS(Frag->getContentsForAppending());
+ SmallString<0> Content;
+ raw_svector_ostream OS(Content);
for (const MCSymbol *S : OWriter.AddrsigSyms) {
if (!S->isRegistered())
continue;
@@ -1085,15 +1086,15 @@ uint64_t WinCOFFWriter::writeObject() {
"executePostLayoutBinding!");
encodeULEB128(SectionMap[TargetSection]->Symbol->getIndex(), OS);
}
- Frag->doneAppending();
+ auto *Sec = getContext().getCOFFSection(".llvm_addrsig",
+ COFF::IMAGE_SCN_LNK_REMOVE);
+ Sec->curFragList()->Tail->setVarContents(OS.str());
}
// Create the contents of the .llvm.call-graph-profile section.
if (Mode != DwoOnly && !OWriter.getCGProfile().empty()) {
- auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile",
- COFF::IMAGE_SCN_LNK_REMOVE);
- auto *Frag = Sec->curFragList()->Head;
- raw_svector_ostream OS(Frag->getContentsForAppending());
+ SmallString<0> Content;
+ raw_svector_ostream OS(Content);
for (const auto &CGPE : OWriter.getCGProfile()) {
uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
@@ -1101,7 +1102,9 @@ uint64_t WinCOFFWriter::writeObject() {
support::endian::write(OS, ToIndex, W.Endian);
support::endian::write(OS, CGPE.Count, W.Endian);
}
- Frag->doneAppending();
+ auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile",
+ COFF::IMAGE_SCN_LNK_REMOVE);
+ Sec->curFragList()->Tail->setVarContents(OS.str());
}
assignFileOffsets();
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index 8f9444f..86c6b12 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -64,14 +64,14 @@ struct Section {
return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
}
- bool isVirtualSection() const {
+ bool isBssSection() const {
return (getType() == MachO::S_ZEROFILL ||
getType() == MachO::S_GB_ZEROFILL ||
getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
}
bool hasValidOffset() const {
- return !(isVirtualSection() || OriginalOffset == 0);
+ return !(isBssSection() || OriginalOffset == 0);
}
};
diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
index 7c24d12..89c1df8 100644
--- a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
@@ -112,7 +112,7 @@ size_t MachOWriter::totalSize() const {
for (const std::unique_ptr<Section> &S : LC.Sections) {
if (!S->hasValidOffset()) {
assert((S->Offset == 0) && "Skipped section's offset must be zero");
- assert((S->isVirtualSection() || S->Size == 0) &&
+ assert((S->isBssSection() || S->Size == 0) &&
"Non-zero-fill sections with zero offset must have zero size");
continue;
}
@@ -240,7 +240,7 @@ void MachOWriter::writeSections() {
for (const std::unique_ptr<Section> &Sec : LC.Sections) {
if (!Sec->hasValidOffset()) {
assert((Sec->Offset == 0) && "Skipped section's offset must be zero");
- assert((Sec->isVirtualSection() || Sec->Size == 0) &&
+ assert((Sec->isBssSection() || Sec->Size == 0) &&
"Non-zero-fill sections with zero offset must have zero size");
continue;
}
diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt
index 870169a..0f6d2f7 100644
--- a/llvm/lib/Object/CMakeLists.txt
+++ b/llvm/lib/Object/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_component_library(LLVMObject
OffloadBundle.cpp
RecordStreamer.cpp
RelocationResolver.cpp
+ SFrameParser.cpp
SymbolicFile.cpp
SymbolSize.cpp
TapiFile.cpp
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 5597d7d..0919c6a 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -620,7 +620,9 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
StringRef ELFObjectFileBase::getNVPTXCPUName() const {
assert(getEMachine() == ELF::EM_CUDA);
- unsigned SM = getPlatformFlags() & ELF::EF_CUDA_SM;
+ unsigned SM = getEIdentABIVersion() == ELF::ELFABIVERSION_CUDA_V1
+ ? getPlatformFlags() & ELF::EF_CUDA_SM
+ : getPlatformFlags() & ELF::EF_CUDA_SM_MASK;
switch (SM) {
// Fermi architecture.
@@ -679,7 +681,18 @@ StringRef ELFObjectFileBase::getNVPTXCPUName() const {
// Hopper architecture.
case ELF::EF_CUDA_SM90:
- return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_90a" : "sm_90";
+ return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS_V1 ? "sm_90a"
+ : "sm_90";
+
+ // Blackwell architecture.
+ case ELF::EF_CUDA_SM100:
+ return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_100a"
+ : "sm_100";
+
+ // Rubin architecture.
+ case ELF::EF_CUDA_SM120:
+ return getPlatformFlags() & ELF::EF_CUDA_ACCELERATORS ? "sm_120a"
+ : "sm_120";
default:
llvm_unreachable("Unknown EF_CUDA_SM value");
}
diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp
new file mode 100644
index 0000000..2d74d1d
--- /dev/null
+++ b/llvm/lib/Object/SFrameParser.cpp
@@ -0,0 +1,55 @@
+//===- SFrameParser.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/SFrameParser.h"
+#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+template <typename T>
+static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data,
+ uint64_t Offset) {
+ static_assert(std::is_trivial_v<T>);
+ if (Data.size() < Offset + sizeof(T)) {
+ return createStringError(
+ formatv("unexpected end of data at offset {0:x} while reading [{1:x}, "
+ "{2:x})",
+ Data.size(), Offset, Offset + sizeof(T))
+ .str(),
+ object_error::unexpected_eof);
+ }
+ return *reinterpret_cast<const T *>(Data.data() + Offset);
+}
+
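A hypothetical use of the helper above, reading a packed 32-bit field at a
fixed offset while propagating short-buffer errors (byte order ignored for
brevity; the real callers use the endian-aware sframe types):

    Expected<const uint32_t &> Field = getDataSliceAs<uint32_t>(Data, /*Offset=*/4);
    if (!Field)
      return Field.takeError();
    uint32_t Value = *Field;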
+template <endianness E>
+Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) {
+ Expected<const sframe::Preamble<E> &> Preamble =
+ getDataSliceAs<sframe::Preamble<E>>(Contents, 0);
+ if (!Preamble)
+ return Preamble.takeError();
+
+ if (Preamble->Magic != sframe::Magic)
+ return createError(
+ formatv("invalid magic number ({0:x+4})", Preamble->Magic.value()));
+ if (Preamble->Version != sframe::Version::V2)
+ return createError(
+ formatv("invalid/unsupported version number ({0})",
+ static_cast<unsigned>(Preamble->Version.value())));
+
+ Expected<const sframe::Header<E> &> Header =
+ getDataSliceAs<sframe::Header<E>>(Contents, 0);
+ if (!Header)
+ return Header.takeError();
+ return SFrameParser(Contents, *Header);
+}
+
+template class llvm::object::SFrameParser<endianness::big>;
+template class llvm::object::SFrameParser<endianness::little>;
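Only these two instantiations exist, so callers pick an endianness at compile
time. A minimal caller sketch under that assumption (checkSFrame is a
hypothetical function, not part of this patch):

    #include "llvm/Object/SFrameParser.h"
    using namespace llvm;
    using namespace llvm::object;
    static Error checkSFrame(ArrayRef<uint8_t> SectionBytes) {
      auto Parser = SFrameParser<endianness::little>::create(SectionBytes);
      if (!Parser)
        return Parser.takeError(); // truncated, bad magic, or bad version
      return Error::success();
    }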
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 80fb52f..f810368 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -124,6 +124,7 @@
#include "llvm/CodeGen/MachineCopyPropagation.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineLateInstrsCleanup.h"
#include "llvm/CodeGen/MachinePassManager.h"
@@ -363,6 +364,7 @@
#include "llvm/Transforms/Utils/MoveAutoInit.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/ProfileVerify.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/StripGCRelocates.h"
#include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h"
@@ -1189,9 +1191,13 @@ Expected<GVNOptions> parseGVNOptions(StringRef Params) {
} else if (ParamName == "split-backedge-load-pre") {
Result.setLoadPRESplitBackedge(Enable);
} else if (ParamName == "memdep") {
+ // MemDep and MemorySSA are mutually exclusive.
Result.setMemDep(Enable);
+ Result.setMemorySSA(!Enable);
} else if (ParamName == "memoryssa") {
+ // MemDep and MemorySSA are mutually exclusive.
Result.setMemorySSA(Enable);
+ Result.setMemDep(!Enable);
} else {
return make_error<StringError>(
formatv("invalid GVN pass parameter '{}'", ParamName).str(),
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index caa78b6..bb7ccdb 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -520,6 +520,8 @@ FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(errs()))
FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(errs()))
FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(errs()))
FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(errs()))
+FUNCTION_PASS("prof-inject", ProfileInjectorPass())
+FUNCTION_PASS("prof-verify", ProfileVerifierPass())
FUNCTION_PASS("reassociate", ReassociatePass())
FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass())
FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib())
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 5c7b9e0..886add7 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1295,7 +1295,7 @@ Error IndexedInstrProfReader::readHeader() {
// Writer first writes the length of compressed string, and then the actual
// content.
const char *VTableNamePtr = (const char *)Ptr;
- if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd())
+ if (VTableNamePtr > DataBuffer->getBufferEnd())
return make_error<InstrProfError>(instrprof_error::truncated);
VTableName = StringRef(VTableNamePtr, CompressedVTableNamesLen);
diff --git a/llvm/lib/Support/AArch64AttributeParser.cpp b/llvm/lib/Support/AArch64AttributeParser.cpp
index c675ef2..eed8dba 100644
--- a/llvm/lib/Support/AArch64AttributeParser.cpp
+++ b/llvm/lib/Support/AArch64AttributeParser.cpp
@@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "llvm/Support/AArch64AttributeParser.h"
+#include "llvm/Support/AArch64BuildAttributes.h"
std::vector<llvm::SubsectionAndTagToTagName> &
llvm::AArch64AttributeParser::returnTagsNamesMap() {
@@ -19,3 +20,29 @@ llvm::AArch64AttributeParser::returnTagsNamesMap() {
{"aeabi_feature_and_bits", 2, "Tag_Feature_GCS"}};
return TagsNamesMap;
}
+
+llvm::AArch64BuildAttrSubsections llvm::extractBuildAttributesSubsections(
+ const llvm::AArch64AttributeParser &Attributes) {
+
+ llvm::AArch64BuildAttrSubsections SubSections;
+ auto GetPauthValue = [&Attributes](unsigned Tag) {
+ return Attributes.getAttributeValue("aeabi_pauthabi", Tag).value_or(0);
+ };
+ SubSections.Pauth.TagPlatform =
+ GetPauthValue(llvm::AArch64BuildAttributes::TAG_PAUTH_PLATFORM);
+ SubSections.Pauth.TagSchema =
+ GetPauthValue(llvm::AArch64BuildAttributes::TAG_PAUTH_SCHEMA);
+
+ auto GetFeatureValue = [&Attributes](unsigned Tag) {
+ return Attributes.getAttributeValue("aeabi_feature_and_bits", Tag)
+ .value_or(0);
+ };
+ SubSections.AndFeatures |=
+ GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_BTI);
+ SubSections.AndFeatures |=
+ GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_PAC) << 1;
+ SubSections.AndFeatures |=
+ GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_GCS) << 2;
+
+ return SubSections;
+}
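The features word packs one bit per tag. A small self-check of the layout
assumed above (BTI in bit 0, PAC in bit 1, GCS in bit 2):

    constexpr unsigned packFeatures(unsigned BTI, unsigned PAC, unsigned GCS) {
      return BTI | (PAC << 1) | (GCS << 2);
    }
    static_assert(packFeatures(1, 1, 0) == 0b011, "BTI and PAC, no GCS");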
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index d5c3cba..8491633 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -68,11 +68,19 @@ template class LLVM_EXPORT_TEMPLATE basic_parser<float>;
template class LLVM_EXPORT_TEMPLATE basic_parser<std::string>;
template class LLVM_EXPORT_TEMPLATE basic_parser<char>;
-template class opt<unsigned>;
-template class opt<int>;
-template class opt<std::string>;
-template class opt<char>;
-template class opt<bool>;
+#if !(defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS) && defined(_MSC_VER))
+// Only instantiate opt<std::string> when not building a Windows DLL. When
+// exporting opt<std::string>, MSVC implicitly exports symbols for
+// std::basic_string through transitive inheritance via std::string. These
+// symbols may appear in clients, leading to duplicate symbol conflicts.
+template class LLVM_EXPORT_TEMPLATE opt<std::string>;
+#endif
+
+template class LLVM_EXPORT_TEMPLATE opt<bool>;
+template class LLVM_EXPORT_TEMPLATE opt<char>;
+template class LLVM_EXPORT_TEMPLATE opt<int>;
+template class LLVM_EXPORT_TEMPLATE opt<unsigned>;
+
} // namespace cl
} // namespace llvm
@@ -95,6 +103,15 @@ void parser<float>::anchor() {}
void parser<std::string>::anchor() {}
void parser<char>::anchor() {}
+// These anchor functions instantiate opt<T> and reference its virtual
+// destructor to ensure MSVC exports the corresponding vtable and typeinfo when
+// building a Windows DLL. Without an explicit reference, MSVC may omit the
+// instantiation at link time even if it is marked for DLL export.
+void opt_bool_anchor() { opt<bool> anchor{""}; }
+void opt_char_anchor() { opt<char> anchor{""}; }
+void opt_int_anchor() { opt<int> anchor{""}; }
+void opt_unsigned_anchor() { opt<unsigned> anchor{""}; }
+
//===----------------------------------------------------------------------===//
const static size_t DefaultPad = 2;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 12fc976..201bfe0 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1205,32 +1205,36 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Register DstReg = MI.getOperand(0).getReg();
if (DstReg == MI.getOperand(3).getReg()) {
// Expand to BIT
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
- : AArch64::BITv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(3))
- .add(MI.getOperand(2))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+ : AArch64::BITv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else if (DstReg == MI.getOperand(2).getReg()) {
// Expand to BIF
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
- : AArch64::BIFv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+ : AArch64::BIFv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else {
// Expand to BSL, use additional move if required
if (DstReg == MI.getOperand(1).getReg()) {
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I, I);
} else {
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
@@ -1240,15 +1244,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
getRenamableRegState(MI.getOperand(0).isRenamable()))
.add(MI.getOperand(1))
.add(MI.getOperand(1));
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .addReg(DstReg,
- RegState::Kill |
- getRenamableRegState(MI.getOperand(0).isRenamable()))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill | getRenamableRegState(
+ MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I2, I2);
}
}
MI.eraseFromParent();
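Each expansion now funnels its MachineInstrBuilder through transferImpOps, so
implicit operands attached to the BSP pseudo (an implicit-def added by an
earlier pass, say) survive onto the replacement instruction instead of being
dropped with the pseudo. A simplified model of the copy (toy types, not the
in-tree signature):

    #include <vector>
    struct Operand { bool IsImplicit = false; /* reg, flags, ... */ };
    // Append the pseudo's implicit operands to the expanded instruction.
    void transferImpOps(const std::vector<Operand> &Pseudo,
                        std::vector<Operand> &Expanded) {
      for (const Operand &Op : Pseudo)
        if (Op.IsImplicit)
          Expanded.push_back(Op);
    }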
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f026726..ef3e8c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -164,6 +164,9 @@ static cl::opt<bool> UseFEATCPACodegen(
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Value type used for NZCV flags.
+static constexpr MVT FlagsVT = MVT::i32;
+
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7};
@@ -3451,7 +3454,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
}
unsigned Opcode =
IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
- return DAG.getNode(Opcode, DL, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
+ return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -3465,7 +3468,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
- return DAG.getNode(AArch64ISD::FCMP, DL, MVT::i32, LHS, RHS);
+ return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
@@ -3490,7 +3493,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
const SDValue ANDSNode =
- DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, MVT_CC),
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
LHS.getOperand(0), LHS.getOperand(1));
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
@@ -3501,7 +3504,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
- return DAG.getNode(Opcode, DL, DAG.getVTList(VT, MVT_CC), LHS, RHS)
+ return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
.getValue(1);
}
@@ -3597,7 +3600,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
- return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+ return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
@@ -4036,7 +4039,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
// Check that the result fits into a 32-bit integer.
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
if (IsSigned) {
// cmp xreg, wreg, sxtw
SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
@@ -4059,12 +4062,12 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
@@ -4075,7 +4078,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
} // switch (...)
if (Opc) {
- SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
@@ -4177,7 +4180,7 @@ static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
return Cmp.getValue(1);
}
@@ -4220,16 +4223,15 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(VT0, VT1);
- SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
+ SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
OpRHS, OpCarryIn);
SDValue OutFlag =
IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
: carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
+ return DAG.getMergeValues({Sum, OutFlag}, DL);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -4254,8 +4256,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
Overflow =
DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow);
+ return DAG.getMergeValues({Value, Overflow}, DL);
}
// Prefetch operands are:
@@ -7037,9 +7038,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(0));
// Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
@@ -11108,7 +11108,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
SDValue Carry = Op.getOperand(2);
// SBCS uses a carry not a borrow so the carry flag should be inverted first.
SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
- SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
+ SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
LHS, RHS, InvCarry);
EVT OpVT = Op.getValueType();
@@ -12441,10 +12441,10 @@ SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
// Get NZCV register. Only update chain when copyfrom is glued.
if (Glue.getNode()) {
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
Chain = Glue.getValue(1);
} else
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
// Extract CC code.
SDValue CC = getSETCC(Cond, Glue, DL, DAG);
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
@@ -18015,11 +18020,14 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
unsigned ShlAmt = C2->getZExtValue();
if (auto ShouldADD = *N->user_begin();
ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
- if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
- unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
- if ((1ULL << ShlAmt) == ByteVT &&
- isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
- return false;
+ if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
+ EVT MemVT = Load->getMemoryVT();
+
+ if (Load->getValueType(0).isScalableVector())
+ return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
+
+ if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
+ return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
}
}
}
@@ -18588,7 +18596,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
Created.push_back(And.getNode());
} else {
SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDVTList VTs = DAG.getVTList(VT, FlagsVT);
SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
@@ -19477,10 +19485,10 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
// can select to CCMN to avoid the extra mov
SDValue AbsOp1 =
DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
- CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
- NZCVOp, Condition, Cmp0);
+ CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
+ AbsOp1, NZCVOp, Condition, Cmp0);
} else {
- CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+ CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
}
return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
@@ -25129,8 +25137,9 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
if (!TReassocOp && !FReassocOp)
return SDValue();
- SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
- DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
+ SDValue NewCmp =
+ DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
+ DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
if (!ReassocOp)
@@ -27156,7 +27165,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
: AArch64SysReg::RNDRRS);
SDLoc DL(N);
SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
SDValue B = DAG.getNode(
AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 713793e..d8403c2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,7 +215,8 @@ public:
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index bc57537..802e4a9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -533,8 +533,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MBP.LHS = LastInst->getOperand(0);
MBP.RHS = MachineOperand::CreateImm(0);
- MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
- : MachineBranchPredicate::PRED_EQ;
+ MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW)
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9f8a257..9ebdf2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -430,26 +430,27 @@ def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
- SDTCisInt<0>, SDTCisVT<1, i32>]>;
+ SDTCisInt<0>,
+ SDTCisVT<1, FlagsVT>]>;
// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<0>,
- SDTCisVT<3, i32>]>;
+ SDTCisVT<3, FlagsVT>]>;
// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
- SDTCisVT<1, i32>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<1, FlagsVT>,
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64Brcond : SDTypeProfile<0, 3,
[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<2, FlagsVT>]>;
def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
@@ -458,22 +459,22 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64CCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisInt<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
-def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -1124,10 +1125,10 @@ def AArch64probedalloca
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPMayStore]>;
-// MRS, also sets the flags via a glue.
+// MRS, also sets the flags.
def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<2, 1, [SDTCisVT<0, i64>,
- SDTCisVT<1, i32>,
+ SDTCisVT<1, FlagsVT>,
SDTCisVT<2, i32>]>,
[SDNPHasChain]>;
@@ -3934,6 +3935,26 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+// load zero-extended i32, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+
+// load zero-extended i16, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+// load zero-extended i16, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
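+// The patterns above work because a zero-extending integer load has the same
+// bit layout as a narrow FP-register load, which zeroes the rest of the
+// vector register. A standalone model of the first pattern (function name
+// invented for illustration):
+//
+//   #include <cstdint>
+//   #include <cstring>
+//   double zextLoadBitcastF64(const uint32_t *P) {
+//     uint64_t Wide = *P;                // zextloadi32
+//     double D;
+//     std::memcpy(&D, &Wide, sizeof(D)); // bitconvert i64 -> f64
+//     return D;                          // same bits as LDRSui + SUBREG_TO_REG
+//   }
+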
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 0ddd17c..abcd550 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
-// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
@@ -126,7 +126,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI);
+ bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -194,12 +194,12 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(
- unsigned Opc, MachineInstr &MI) {
+bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
+ unsigned OtherOpc) {
// Try below transformation.
//
- // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
- // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+ // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
@@ -208,10 +208,10 @@ bool AArch64MIPeepholeOpt::visitAND(
return splitTwoPartImm<T>(
MI,
- [Opc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
+ [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
- return std::make_pair(Opc, Opc);
+ return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
@@ -864,6 +864,12 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::ANDXrr:
Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
break;
+ case AArch64::ANDSWrr:
+ Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ break;
+ case AArch64::ANDSXrr:
+ Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
break;
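
The new ANDS cases reuse the same immediate-splitting rule: the mask must be the AND of two encodable logical immediates. A hedged standalone example (constant chosen by the editor; on this reading 0x0000f00f is not itself encodable as a logical immediate, but both factors are):

  #include <cstdint>
  // MOVi32imm #0x0000f00f + ANDSWrr ==> ANDWri #0x0000ffff + ANDSWri #0xf00ff00f
  static_assert((0x0000ffffu & 0xf00ff00fu) == 0x0000f00fu,
                "split immediates recombine to the original mask");
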
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 61bf87f..1a7609b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -305,7 +305,8 @@ def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
// Condition code regclass.
-def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+defvar FlagsVT = i32;
+def CCR : RegisterClass<"AArch64", [FlagsVT], 32, (add NZCV)> {
let CopyCost = -1; // Don't allow copying of status registers.
// CCR is not allocatable.
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index bafb8d0..8a5b5ba 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -32,10 +32,29 @@ AArch64SelectionDAGInfo::AArch64SelectionDAGInfo()
void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
const SDNode *N) const {
+ SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+
#ifndef NDEBUG
+ // Some additional checks not yet implemented by verifyTargetNode.
+ constexpr MVT FlagsVT = MVT::i32;
switch (N->getOpcode()) {
- default:
- return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+ case AArch64ISD::SUBS:
+ assert(N->getValueType(1) == FlagsVT);
+ break;
+ case AArch64ISD::ADC:
+ case AArch64ISD::SBC:
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::ADCS:
+ case AArch64ISD::SBCS:
+ assert(N->getValueType(1) == FlagsVT);
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::CSEL:
+ case AArch64ISD::CSINC:
+ case AArch64ISD::BRCOND:
+ assert(N->getOperand(3).getValueType() == FlagsVT);
+ break;
case AArch64ISD::SADDWT:
case AArch64ISD::SADDWB:
case AArch64ISD::UADDWT:
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 75c7dd9..f136a184 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -581,7 +581,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
// statement if return_twice functions are called.
bool StandardLifetime =
!SInfo.CallsReturnTwice &&
- SInfo.UnrecognizedLifetimes.empty() &&
memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, DT, LI,
ClMaxLifetimes);
if (StandardLifetime) {
@@ -616,10 +615,5 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
memtag::annotateDebugRecords(Info, Tag);
}
- // If we have instrumented at least one alloca, all unrecognized lifetime
- // intrinsics have to go.
- for (auto *I : SInfo.UnrecognizedLifetimes)
- I->eraseFromParent();
-
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 2409cc8..0f4f012 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -534,7 +534,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference(
}
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// LNT run (at least on Cyclone) showed reasonably significant gains for
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 154db3c..061ed61 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -343,7 +343,8 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
SDep &Dep,
const TargetSchedModel *SchedModel) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 90d3d92..40f49da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -249,7 +249,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
return false;
}
-uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
+APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
StringRef AttributeStr =
isMultiversionedFunction(F) ? "fmv-features" : "target-features";
StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b27eb2e..7f45177 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -89,7 +89,7 @@ public:
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
unsigned DefaultCallPenalty) const override;
- uint64_t getFeatureMask(const Function &F) const override;
+ APInt getFeatureMask(const Function &F) const override;
bool isMultiversionedFunction(const Function &F) const override;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 3d4a14b..1a9bce5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -9,8 +9,6 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0e0e83b..8b8fc8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+ "HasFmaMixBF16Insts",
+ "true",
+ "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
"HasIEEEMinimumMaximumInsts",
"true",
@@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;
+def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16",
+ "HasMin3Max3PKF16",
+ "true",
+ "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions"
+>;
+
def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
"HasMinimum3Maximum3PKF16",
"true",
@@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
"S_INST_PREFETCH instruction causes shader to hang"
>;
+def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
+ "HasVmemPrefInsts",
+ "true",
+ "Has flat_prefect_b8 and global_prefetch_b8 instructions"
+>;
+
def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
"HasSafeSmemPrefetch",
"true",
"SMEM prefetches do not fail on illegal address"
>;
+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+ "HasSafeCUPrefetch",
+ "true",
+ "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"Has bf16 conversion instructions"
>;
+def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts",
+ "HasBF16PackedInsts",
+ "true",
+ "Has bf16 packed instructions (fma, add, mul, max, min)"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
+def FeatureAddSubU64Insts
+ : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
+ "Has v_add_u64 and v_sub_u64 instructions">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1848,7 +1882,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureImageInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
- FeatureMemoryAtomicFAddF32DenormalSupport]>;
+ FeatureMemoryAtomicFAddF32DenormalSupport,
+ FeatureRealTrue16Insts]>;
// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
@@ -1868,8 +1903,7 @@ def FeatureISAVersion11_0_Common : FeatureSet<
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
FeatureMADIntraFwdBug,
- FeaturePrivEnabledTrap2NopBug,
- FeatureRealTrue16Insts])>;
+ FeaturePrivEnabledTrap2NopBug])>;
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
@@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
+ FeatureBF16PackedInsts,
FeatureCvtPkF16F32Inst,
+ FeatureFmaMixBF16Insts,
+ FeatureMin3Max3PKF16,
FeatureMinimum3Maximum3PKF16,
FeaturePrngInst,
FeaturePermlane16Swap,
@@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureKernargPreload,
+ FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
+ FeatureAddSubU64Insts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 :
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
+def HasMin3Max3PKF16 :
+ Predicate<"Subtarget->hasMin3Max3PKF16()">,
+ AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>;
+
def HasMinimum3Maximum3PKF16 :
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;
@@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
+def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16PackedInsts)>;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
@@ -2519,6 +2565,14 @@ def HasFmaakFmamkF64Insts :
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+def HasPkAddMinMaxInsts :
+ Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
+def HasPkMinMax3Insts :
+ Predicate<"Subtarget->hasPkMinMax3Insts()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
AssemblerPredicate<(all_of FeatureImageInsts)>;
@@ -2565,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+ AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<(all_of FeatureDLInsts)>;
@@ -2763,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;
+def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
+ AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
+
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
+def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
+ AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 749b9ef..4b3dc37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1415,6 +1415,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
+ MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
if (AMDGPU::isCompute(CC)) {
MD->setHwStage(CC, ".trap_present",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index dedee46..49d8b44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1383,7 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});
+ &AAIndirectCallInfo::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 14101e5..3d8d274 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -374,8 +374,10 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
return true;
}
- unsigned ReturnOpc =
- IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
+ const bool IsWholeWave = MFI->isWholeWaveFunction();
+ unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
+ : IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
+ : AMDGPU::SI_RETURN;
auto Ret = B.buildInstrNoInsert(ReturnOpc);
if (!FLI.CanLowerReturn)
@@ -383,6 +385,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
+ if (IsWholeWave)
+ addOriginalExecToReturn(B.getMF(), Ret);
+
// TODO: Handle CalleeSavedRegsViaCopy.
B.insertInstr(Ret);
@@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;
+ if (Info->isWholeWaveFunction() && Idx == 0) {
+ assert(VRegs[Idx].size() == 1 && "Expected only one register");
+
+ // The first argument for whole wave functions is the original EXEC value.
+ B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+ .addDef(VRegs[Idx][0]);
+
+ ++Idx;
+ continue;
+ }
+
const bool InReg = Arg.hasAttribute(Attribute::InReg);
if (Arg.hasAttribute(Attribute::SwiftSelf) ||
@@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall(
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
@@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return true;
}
+
+void AMDGPUCallLowering::addOriginalExecToReturn(
+ MachineFunction &MF, MachineInstrBuilder &Ret) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
+ Ret.addReg(Setup->getOperand(0).getReg());
+}
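
Together with the setup instruction created in lowerFormalArguments, the original EXEC mask now flows through a whole wave function as an ordinary virtual register; roughly, as a generic-MIR sketch (editor's paraphrase, not captured output):

  %exec0 = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
  ...
  G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %exec0
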
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f..e0033d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering {
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
+ void addOriginalExecToReturn(MachineFunction &MF,
+ MachineInstrBuilder &Ret) const;
+
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2bfd56f..c01e5d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -137,6 +137,9 @@ def gi_global_offset :
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
+def gi_global_saddr_cpol :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
+ GIComplexPatternEquiv<GlobalSAddrCPol>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
@@ -315,6 +318,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
+def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
+// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
+// so we don't mark it as equivalent.
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
@@ -442,5 +449,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
+ GISDNodeXFormEquiv<PrefetchLoc>;
+
def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 00979f4..f36935d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
return LLT::scalar(32);
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI);
-
-static void unmergeReadAnyLane(MachineIRBuilder &B,
- SmallVectorImpl<Register> &SgprDstParts,
- LLT UnmergeTy, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &, Register,
+ const RegisterBankInfo &, ReadLaneFnTy);
+
+template <typename ReadLaneFnTy>
+static void
+unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts,
+ LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
- SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
+ SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
}
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
+ const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
LLT Ty = B.getMRI()->getType(VgprSrc);
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
if (Ty.getSizeInBits() == 32) {
- return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
- .getReg(0);
+ Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
+ return BuildRL(B, SgprDst, VgprSrc).getReg(0);
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildRL);
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}
-void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
- Register VgprSrc, const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildReadLane) {
LLT Ty = B.getMRI()->getType(VgprSrc);
if (Ty.getSizeInBits() == 32) {
- B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ BuildReadLane(B, SgprDst, VgprSrc);
return;
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildReadLane);
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}
+
+void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ });
+}
+
+void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
+ .addReg(VgprSrc);
+ });
+}
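+
+// Both entry points now share the recursive splitting logic and differ only
+// in the 32-bit leaf they emit, so call sites pick the lane semantics
+// directly (a usage sketch assuming B, SgprDst, VgprSrc and RBI are in scope
+// as in the functions above):
+//
+//   // Arbitrary lane (valid for uniform values) vs. first active lane:
+//   AMDGPU::buildReadAnyLane(B, SgprDst, VgprSrc, RBI);
+//   AMDGPU::buildReadFirstLane(B, SgprDst, VgprSrc, RBI);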
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 0c89bb5..5e1000e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -51,6 +51,8 @@ private:
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ const RegisterBankInfo &RBI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0e..dfaa145 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1863,9 +1863,17 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
SIInstrFlags::FlatScratch);
}
-// If this matches zero_extend i32:x, return x
-static SDValue matchZExtFromI32(SDValue Op) {
- if (Op.getOpcode() != ISD::ZERO_EXTEND)
+// If this matches *_extend i32:x, return x.
+// Otherwise, if the value is already i32, return it unchanged.
+static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
+ const SelectionDAG *DAG) {
+ if (Op.getValueType() == MVT::i32)
+ return Op;
+
+ if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
+ Op.getOpcode() != ISD::ANY_EXTEND &&
+ !(DAG->SignBitIsZero(Op) &&
+ Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
return SDValue();
SDValue ExtSrc = Op.getOperand(0);
@@ -1873,12 +1881,13 @@ static SDValue matchZExtFromI32(SDValue Op) {
}
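
The SignBitIsZero escape hatch above accepts the opposite extension kind because sign- and zero-extension agree whenever the top bit of the 32-bit value is known clear; a one-line check of that equivalence:

  #include <cstdint>
  static_assert(int64_t(int32_t(0x7fffffff)) == int64_t(uint64_t(0x7fffffffu)),
                "sext and zext of an i32 agree when its sign bit is zero");
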
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
- SDValue Addr,
- SDValue &SAddr,
- SDValue &VOffset,
- SDValue &Offset) const {
+// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset, bool &ScaleOffset,
+ bool NeedIOffset) const {
int64_t ImmOffset = 0;
+ ScaleOffset = false;
// Match the immediate offset first, which canonically is moved as low as
// possible.
@@ -1888,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
+ if (NeedIOffset &&
+ TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
@@ -1898,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
- COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
+ if (NeedIOffset) {
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ }
- if (isUInt<32>(RemainderOffset)) {
+ if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
+ : isUInt<32>(RemainderOffset)) {
SDNode *VMov = CurDAG->getMachineNode(
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
@@ -1929,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// Match the variable offset.
if (Addr.getOpcode() == ISD::ADD) {
LHS = Addr.getOperand(0);
- RHS = Addr.getOperand(1);
if (!LHS->isDivergent()) {
- // add (i64 sgpr), (zero_extend (i32 vgpr))
- if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ // add (i64 sgpr), (*_extend (i32 vgpr))
+ RHS = Addr.getOperand(1);
+ ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtRHS = matchExtFromI32orI32(
+ RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = LHS;
- VOffset = ZextRHS;
+ VOffset = ExtRHS;
}
}
+ RHS = Addr.getOperand(1);
if (!SAddr && !RHS->isDivergent()) {
- // add (zero_extend (i32 vgpr)), (i64 sgpr)
- if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ // add (*_extend (i32 vgpr)), (i64 sgpr)
+ ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtLHS = matchExtFromI32orI32(
+ LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = RHS;
- VOffset = ZextLHS;
+ VOffset = ExtLHS;
}
}
@@ -1953,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
}
+ if (Subtarget->hasScaleOffset() &&
+ (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
+ ? AMDGPUISD::MAD_I64_I32
+ : AMDGPUISD::MAD_U64_U32) ||
+ (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
+ CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
+ Addr.getOperand(0)->isDivergent() &&
+ isa<ConstantSDNode>(Addr.getOperand(1)) &&
+ !Addr.getOperand(2)->isDivergent()) {
+ // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+ ScaleOffset = Addr.getConstantOperandVal(1) == Size;
+ if (ScaleOffset) {
+ SAddr = Addr.getOperand(2);
+ VOffset = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+ return true;
+ }
+ }
+
if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
isa<ConstantSDNode>(Addr))
return false;
@@ -1972,10 +2011,28 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
- if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
- CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
return true;
}
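
A small model of the cache-policy merge above (the SCAL bit position here is a placeholder chosen for illustration, not the real AMDGPU::CPol value):

  #include <cstdint>
  constexpr uint32_t SCAL = 1u << 11; // placeholder bit, illustration only
  constexpr uint32_t mergeCPol(uint32_t PassedCPol, bool ScaleOffset) {
    // Strip any caller-provided SCAL, then set it only when the address
    // match proved the offset is pre-scaled.
    return (PassedCPol & ~SCAL) | (ScaleOffset ? SCAL : 0);
  }
  static_assert(mergeCPol(SCAL | 3u, false) == 3u, "stale SCAL is dropped");
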
@@ -1983,10 +2040,11 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
- if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
- unsigned CPolVal = AMDGPU::CPol::GLC;
+ unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
return true;
}
@@ -2074,7 +2132,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
SDValue &VAddr, SDValue &SAddr,
- SDValue &Offset) const {
+ SDValue &Offset,
+ SDValue &CPol) const {
int64_t ImmOffset = 0;
SDValue LHS, RHS;
@@ -2106,6 +2165,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
+ CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
return true;
}
}
@@ -2139,6 +2199,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
SAddr = SelectSAddrFI(CurDAG, SAddr);
Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+
+ bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(), MVT::i32);
return true;
}
@@ -2159,17 +2223,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
return true;
}
+// Given \p Offset and load node \p N, check whether \p Offset is a multiple
+// of the load byte size. If it is, update \p Offset to the pre-scaled value
+// and return true.
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
+ bool IsSigned) const {
+ bool ScaleOffset = false;
+ if (!Subtarget->hasScaleOffset() || !Offset)
+ return false;
+
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+
+ SDValue Off = Offset;
+ if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
+ Off = Ext;
+
+ if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Log2_32(Size);
+ } else if (Offset.getOpcode() == ISD::MUL ||
+ (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
+ Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
+ (Offset.isMachineOpcode() &&
+ Offset.getMachineOpcode() ==
+ (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
+ : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Size;
+ }
+
+ if (ScaleOffset)
+ Offset = Off.getOperand(0);
+
+ return ScaleOffset;
+}
+
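+// A rough model of the two shapes this helper accepts, for a load of
+// LoadBytes bytes: an offset of the form (idx << log2(size)) or (idx * size)
+// can drop the scaling and let the SCAL bit apply it in hardware:
+//
+//   #include <cstdint>
+//   constexpr bool shiftIsScale(unsigned ShAmt, unsigned LoadBytes) {
+//     return (1u << ShAmt) == LoadBytes; // shl amount == log2(load size)
+//   }
+//   constexpr bool mulIsScale(uint64_t MulAmt, unsigned LoadBytes) {
+//     return MulAmt == LoadBytes;        // multiplier == load size
+//   }
+//   static_assert(shiftIsScale(2, 4) && mulIsScale(4, 4), "e.g. a b32 load");
+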
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+ bool HasSOffset, int64_t ImmOffset,
+ bool *ScaleOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");
+ if (ScaleOffset) {
+ assert(N && SOffset);
+
+ *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
+ }
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
if (!SOffset)
@@ -2254,24 +2360,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
- SDValue *SOffset, SDValue *Offset,
- bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only,
+ bool IsBuffer, bool HasSOffset,
+ int64_t ImmOffset,
+ bool *ScaleOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
- if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+ if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
return false;
int64_t ImmOff = 0;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
ImmOff = C->getSExtValue();
- return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
- ImmOff);
+ return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
+ true, ImmOff, ScaleOffset);
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2291,23 +2398,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
if (!N0 || !N1)
return false;
- if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N0;
return true;
}
- if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N1;
return true;
}
return false;
}
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only) const {
- if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+ bool Imm32Only, bool *ScaleOffset) const {
+ if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
+ /* IsBuffer */ false, /* HasSOffset */ false,
+ /* ImmOffset */ 0, ScaleOffset)) {
SBase = Expand32BitAddress(SBase);
return true;
}
@@ -2323,36 +2432,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
- /* Imm32Only */ true);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset, /* Imm32Only */ true);
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
- SDValue &SOffset) const {
- return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
+ /* Imm32Only */ false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
- SDValue &SOffset,
- SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, &SOffset, &Offset);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue &SOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ true, /* IsBuffer */ true);
}
@@ -2361,9 +2485,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
// Match the (soffset + offset) pair as a 32-bit register base and
// an immediate offset.
return N.getValueType() == MVT::i32 &&
- SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
- &Offset, /* Imm32Only */ false,
- /* IsBuffer */ true);
+ SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
+ /* SOffset*/ nullptr, &Offset,
+ /* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
@@ -3753,58 +3877,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+// Match a lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16 bits of zeroes at the LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+ if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+
+ IsExtractHigh = false;
+ if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+ auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ if (!Low16 || !Low16->isZero())
+ return SDValue();
+ Op = stripBitcast(Op.getOperand(1));
+ if (Op.getValueType() != MVT::bf16)
+ return SDValue();
+ return Op;
+ }
+
+ if (Op.getValueType() != MVT::i32)
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::AND) {
+ if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Mask->getZExtValue() == 0xffff0000) {
+ IsExtractHigh = true;
+ return Op.getOperand(0);
+ }
+ }
+ return SDValue();
+ }
+
+ if (Op.getOpcode() == ISD::SHL) {
+ if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Amt->getZExtValue() == 16)
+ return Op.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
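+// All three patterns reduce to the same bit layout: a bf16 value is the top
+// half of an f32, so the extension just shifts the payload into the high 16
+// bits. A standalone sketch (function name invented for illustration):
+//
+//   #include <cstdint>
+//   #include <cstring>
+//   float bf16BitsToF32(uint16_t Bits) {
+//     // Pattern 3 above; patterns 1 and 2 produce the same i32 layout.
+//     uint32_t Wide = uint32_t(Bits) << 16;
+//     float F;
+//     std::memcpy(&F, &Wide, sizeof(F));
+//     return F;
+//   }
+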
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const {
+ unsigned &Mods,
+ MVT VT) const {
Mods = 0;
SelectVOP3ModsImpl(In, Src, Mods);
+ bool IsExtractHigh = false;
if (Src.getOpcode() == ISD::FP_EXTEND) {
Src = Src.getOperand(0);
- assert(Src.getValueType() == MVT::f16);
- Src = stripBitcast(Src);
+ } else if (VT == MVT::bf16) {
+ SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
+ if (!B16)
+ return false;
+ Src = B16;
+ } else
+ return false;
- // Be careful about folding modifiers if we already have an abs. fneg is
- // applied last, so we don't want to apply an earlier fneg.
- if ((Mods & SISrcMods::ABS) == 0) {
- unsigned ModsTmp;
- SelectVOP3ModsImpl(Src, Src, ModsTmp);
+ if (Src.getValueType() != VT &&
+ (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+ return false;
- if ((ModsTmp & SISrcMods::NEG) != 0)
- Mods ^= SISrcMods::NEG;
+ Src = stripBitcast(Src);
- if ((ModsTmp & SISrcMods::ABS) != 0)
- Mods |= SISrcMods::ABS;
- }
+ // Be careful about folding modifiers if we already have an abs. fneg is
+ // applied last, so we don't want to apply an earlier fneg.
+ if ((Mods & SISrcMods::ABS) == 0) {
+ unsigned ModsTmp;
+ SelectVOP3ModsImpl(Src, Src, ModsTmp);
- // op_sel/op_sel_hi decide the source type and source.
- // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
- // If the sources's op_sel is set, it picks the high half of the source
- // register.
+ if ((ModsTmp & SISrcMods::NEG) != 0)
+ Mods ^= SISrcMods::NEG;
- Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(Src, Src)) {
- Mods |= SISrcMods::OP_SEL_0;
+ if ((ModsTmp & SISrcMods::ABS) != 0)
+ Mods |= SISrcMods::ABS;
+ }
- // TODO: Should we try to look for neg/abs here?
- }
+ // op_sel/op_sel_hi decide the source type and source.
+ // If the source's op_sel_hi is set, it indicates to do a conversion from
+ // fp16. If the source's op_sel is set, it picks the high half of the source
+ // register.
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
- return true;
+ Mods |= SISrcMods::OP_SEL_1;
+ if (IsExtractHigh ||
+ (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+ Mods |= SISrcMods::OP_SEL_0;
+
+ // TODO: Should we try to look for neg/abs here?
}
- return false;
+ // Prevent unnecessary subreg COPY to VGPR_16
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ }
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
return false;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
@@ -3813,7 +3993,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- SelectVOP3PMadMixModsImpl(In, Src, Mods);
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+ return false;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d..5636d89 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -19,6 +19,7 @@
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -162,10 +163,14 @@ private:
bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
- SDValue &VOffset, SDValue &Offset) const;
+ SDValue &VOffset, SDValue &Offset, bool &ScaleOffset,
+ bool NeedIOffset = true) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
@@ -174,24 +179,31 @@ private:
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
uint64_t ImmOffset) const;
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &SAddr, SDValue &Offset) const;
+ SDValue &SAddr, SDValue &Offset,
+ SDValue &CPol) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
+ bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
+ int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false) const;
+ bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only = false, bool IsBuffer = false,
+ bool HasSOffset = false, int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
+ bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false,
+ bool *ScaleOffset = nullptr) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
- bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
- SDValue &Offset) const;
+ bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const;
+ bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset,
+ SDValue &CPol) const;
+ bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
@@ -246,11 +258,15 @@ private:
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods,
+ MVT VT) const;
bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3d040fb..e3ca09e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -375,7 +375,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
- setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
@@ -1143,6 +1142,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Cold:
return CC_AMDGPU_Func;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
@@ -1168,6 +1168,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
@@ -5875,6 +5876,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
+ NODE_NAME_CASE(WHOLE_WAVE_SETUP)
+ NODE_NAME_CASE(WHOLE_WAVE_RETURN)
}
return nullptr;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4e8c6c7..39bb0ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -608,6 +608,12 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,
+
+ // Set up a whole wave function.
+ WHOLE_WAVE_SETUP,
+
+ // Return from a whole wave function.
+ WHOLE_WAVE_RETURN,
};
} // End namespace AMDGPUISD
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index e2c2e89..f2207ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1694,6 +1694,47 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+ Value *Src0 = II.getArgOperand(1);
+ Value *Src1 = II.getArgOperand(3);
+ unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
+  unsigned FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
+ auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
+ auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
+
+ bool MadeChange = false;
+ unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
+ unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
+
+  // Depending on the format used, fewer registers may be required, so shrink
+  // the vector type.
+ if (Src0Ty->getNumElements() > Src0NumElts) {
+ Src0 = IC.Builder.CreateExtractVector(
+ FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
+ IC.Builder.getInt64(0));
+ MadeChange = true;
+ }
+
+ if (Src1Ty->getNumElements() > Src1NumElts) {
+ Src1 = IC.Builder.CreateExtractVector(
+ FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
+ IC.Builder.getInt64(0));
+ MadeChange = true;
+ }
+
+ if (!MadeChange)
+ return std::nullopt;
+
+ SmallVector<Value *, 13> Args(II.args());
+ Args[1] = Src0;
+ Args[3] = Src1;
+
+ CallInst *NewII = IC.Builder.CreateIntrinsic(
+ IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
+ Args, &II);
+ NewII->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewII);
+ }
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
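
The shrink above hinges on wmmaScaleF8F6F4FormatToNumRegs() mapping each matrix format to the number of 32-bit registers its operand actually occupies. A hedged before/after sketch (the 12-register figure for an fp6-style format is an assumption, not stated in this patch):

    // Before: both operands use the widest form.
    //   call @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4(i32 %fmtA, <16 x i32> %a, ...)
    // After, for a format needing only 12 registers:
    //   %a.shrunk = call <12 x i32> @llvm.vector.extract(<16 x i32> %a, i64 0)
    //   call @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4(i32 %fmtA, <12 x i32> %a.shrunk, ...)
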
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ce58e93a..e305f08 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+// Marks the entry into a whole wave function.
+def AMDGPUwhole_wave_setup : SDNode<
+ "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// Marks the return from a whole wave function.
+def AMDGPUwhole_wave_return : SDNode<
+ "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d161c03..266dee1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
}
/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+ if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
return Register();
assert(Def->getNumOperands() == 3 &&
- MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
return Register();
}
+/// Match a sign extend from a 32-bit value to 64-bits.
+Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
+ Register SExtSrc;
+ if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
+ return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
+
+  // Match legalized form: %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ assert(Def->getNumOperands() == 3 &&
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI,
+ m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
+ m_SpecificICst(31))))
+ return Def->getOperand(1).getReg();
+
+ if (VT->signBitIsZero(Reg))
+ return matchZeroExtendFromS32(Reg);
+
+ return Register();
+}
+
+/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchZeroExtendFromS32(Reg);
+}
+
+/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchSignExtendFromS32(Reg);
+}
+
+Register
+AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
+ bool IsSigned) const {
+ if (IsSigned)
+ return matchSignExtendFromS32OrS32(Reg);
+
+ return matchZeroExtendFromS32OrS32(Reg);
+}
+
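
Illustrative use of the combined matcher (a sketch mirroring the global-saddr selection later in this patch):

    // Accept either a 32-bit register directly or a 32-to-64-bit extend of
    // one, signed or unsigned depending on the addressing mode.
    if (Register Off = matchExtendFromS32OrS32(
            OffsetReg, /*IsSigned=*/Subtarget->hasSignedGVSOffset()))
      OffsetReg = Off; // use the narrow 32-bit source as the voffset
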
Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
Register AnyExtSrc;
if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
@@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
if (isSGPR(SAddr)) {
Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
- if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
Addr = SAddr;
VOffset = Off;
}
@@ -4160,6 +4209,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
return selectWaveAddress(I);
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
+ I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
+ return true;
+ }
case AMDGPU::G_STACKRESTORE:
return selectStackRestore(I);
case AMDGPU::G_PHI:
@@ -5219,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
unsigned Key = 0;
- Register S32 = matchZeroExtendFromS32(*MRI, Src);
+ Register S32 = matchZeroExtendFromS32(Src);
if (!S32)
S32 = matchAnyExtendFromS32(Src);
@@ -5292,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
}};
}
+// Given \p Offset and the load specified by the \p Root operand, check if
+// \p Offset is a multiple of the load byte size. If it is, update \p Offset
+// to the pre-scaled value and return true.
+bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
+ Register &Offset,
+ bool IsSigned) const {
+ if (!Subtarget->hasScaleOffset())
+ return false;
+
+ const MachineInstr &MI = *Root.getParent();
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ if (!MMO->getSize().hasValue())
+ return false;
+
+ uint64_t Size = MMO->getSize().getValue();
+
+ Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
+ if (!OffsetReg)
+ OffsetReg = Offset;
+
+ if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
+ OffsetReg = Def->Reg;
+
+ Register Op0;
+ MachineInstr *Mul;
+ bool ScaleOffset =
+ (isPowerOf2_64(Size) &&
+ mi_match(OffsetReg, *MRI,
+ m_GShl(m_Reg(Op0),
+ m_any_of(m_SpecificICst(Log2_64(Size)),
+ m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
+ mi_match(OffsetReg, *MRI,
+ m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) ||
+ mi_match(
+ OffsetReg, *MRI,
+ m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
+ m_Reg(Op0), m_SpecificICst(Size))) ||
+ // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
+ (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
+ (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
+ : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
+ (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
+ VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
+ mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
+ mi_match(Mul->getOperand(3).getReg(), *MRI,
+ m_GTrunc(m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) &&
+ mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
+
+ if (ScaleOffset)
+ Offset = Op0;
+
+ return ScaleOffset;
+}
+
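
A concrete example of what the matcher accepts (illustrative MIR, not from the patch):

    // For a dword (4-byte) load whose offset is
    //   %off:sgpr(s32) = G_SHL %idx:sgpr(s32), 2
    // the G_SHL-by-log2(Size) alternative matches, Offset is rewritten to
    // %idx, and the caller sets AMDGPU::CPol::SCAL so the hardware re-applies
    // the scaling by the access size. Dropping the shift is sound precisely
    // because Size == 1 << 2.
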
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Register &Base,
Register *SOffset,
- int64_t *Offset) const {
+ int64_t *Offset,
+ bool *ScaleOffset) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
@@ -5310,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
const GEPInfo &GEPI = AddrInfo[0];
std::optional<int64_t> EncodedImm;
+ if (ScaleOffset)
+ *ScaleOffset = false;
+
if (SOffset && Offset) {
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
/*HasSOffset=*/true);
@@ -5317,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
- if (Register OffsetReg =
- matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+ Register OffsetReg = GEPI2.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset =
+ selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
@@ -5363,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
}
if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
- if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+ Register OffsetReg = GEPI.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI.SgprParts[0];
*SOffset = OffsetReg;
return true;
@@ -5377,7 +5499,8 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
Register Base;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
+ if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
+ /* ScaleOffset */ nullptr))
return std::nullopt;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
@@ -5408,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
Register Base, SOffset;
- if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
+ &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
Register Base, SOffset;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
std::pair<Register, int>
@@ -5486,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
- unsigned CPolBits) const {
+ unsigned CPolBits,
+ bool NeedIOffset) const {
Register Addr = Root.getReg();
Register PtrBase;
int64_t ConstOffset;
@@ -5497,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
- if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ if (NeedIOffset &&
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
@@ -5510,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
- ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
+ if (NeedIOffset) {
+ std::tie(SplitImmOffset, RemainderOffset) =
+ TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal);
+ }
- if (isUInt<32>(RemainderOffset)) {
+ if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
+ : isUInt<32>(RemainderOffset)) {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
Register HighBits =
@@ -5524,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
HighBits)
.addImm(RemainderOffset);
+ if (NeedIOffset)
+ return {{
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(PtrBase);
+ }, // saddr
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(HighBits);
+ }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+ }};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
[=](MachineInstrBuilder &MIB) {
MIB.addReg(HighBits);
}, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
}
@@ -5561,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
+ Subtarget->hasSignedGVSOffset());
+ if (Register VOffset = matchExtendFromS32OrS32(
+ PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
+ if (NeedIOffset)
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
+ }}};
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
[=](MachineInstrBuilder &MIB) { // voffset
MIB.addReg(VOffset);
},
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- },
[=](MachineInstrBuilder &MIB) { // cpol
- MIB.addImm(CPolBits);
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
}
}
@@ -5593,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
.addImm(0);
+ if (NeedIOffset)
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
+ }};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
}
@@ -5607,6 +5774,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}
@@ -5728,22 +5905,32 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
return std::nullopt;
+ unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
+ ? AMDGPU::CPol::SCAL
+ : 0;
+
if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = LHSDef->MI->getOperand(1).getIndex();
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
[=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
}};
}
if (!isSGPR(LHS))
+ if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
+ LHS = Def->Reg;
+
+ if (!isSGPR(LHS))
return std::nullopt;
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
}};
}
@@ -6891,6 +7078,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ uint32_t V = MI.getOperand(2).getImm();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+ << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ MIB.addImm(V);
+}
+
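
A worked example of the inversion above, assuming SCOPE_MASK == 3, SCOPE_SHIFT == 3 and the usual scope encodings (SCOPE_SE == 0x8, SCOPE_SYS == 0x18); these constants are assumptions, not stated here:

    // locality 3 (keep closest): V = (3 - 3) << 3 = 0x00 -> CU scope,
    //   clamped up to SCOPE_SE (0x8) when CU-scope prefetch is unsafe.
    // locality 0 (stream away):  V = (3 - 0) << 3 = 0x18 -> system scope.
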
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a..fe9743d0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -232,8 +232,10 @@ private:
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ bool selectScaleOffset(MachineOperand &Root, Register &Offset,
+ bool IsSigned) const;
bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
- int64_t *Offset) const;
+ int64_t *Offset, bool *ScaleOffset) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -254,10 +256,13 @@ private:
selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
+ selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits,
+ bool NeedIOffset = true) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrCPol(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -411,6 +416,10 @@ private:
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+
+ void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
const MachineInstr &MI, int OpIdx) const;
@@ -421,6 +430,19 @@ private:
// shift amount operand's `ShAmtBits` bits is unneeded.
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
+ /// Match a zero extend from a 32-bit value to 64-bits.
+ Register matchZeroExtendFromS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits.
+ Register matchSignExtendFromS32(Register Reg) const;
+ /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchZeroExtendFromS32OrS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchSignExtendFromS32OrS32(Register Reg) const;
+  /// Match either a sign or a zero extend, depending on \p IsSigned, from a
+  /// 32-bit value to 64-bits, or \p Reg itself if it is 32-bit.
+ Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const;
/// Match an any extend from a 32-bit value to 64-bit.
Register matchAnyExtendFromS32(Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d..fedfa3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
assert(Ty.isScalar());
unsigned Size = Ty.getSizeInBits();
+ if (ST.hasVectorMulU64() && Size == 64)
+ return true;
+
unsigned NumParts = Size / 32;
assert((Size % 32) == 0);
assert(NumParts >= 2);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index fa8af68..304e91e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1583,15 +1583,13 @@ void SplitPtrStructs::killAndReplaceSplitInstructions(
if (!SplitUsers.contains(I))
continue;
- SmallVector<DbgValueInst *> Dbgs;
- findDbgValues(Dbgs, I);
- for (auto *Dbg : Dbgs) {
- IRB.SetInsertPoint(Dbg);
+ SmallVector<DbgVariableRecord *> Dbgs;
+ findDbgValues(I, Dbgs);
+ for (DbgVariableRecord *Dbg : Dbgs) {
auto &DL = I->getDataLayout();
assert(isSplitFatPtr(I->getType()) &&
"We should've RAUW'd away loads, stores, etc. at this point");
- auto *OffDbg = cast<DbgValueInst>(Dbg->clone());
- copyMetadata(OffDbg, Dbg);
+ DbgVariableRecord *OffDbg = Dbg->clone();
auto [Rsrc, Off] = getPtrParts(I);
int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType());
@@ -1606,9 +1604,9 @@ void SplitPtrStructs::killAndReplaceSplitInstructions(
if (OffExpr) {
OffDbg->setExpression(*OffExpr);
OffDbg->replaceVariableLocationOp(I, Off);
- IRB.Insert(OffDbg);
+ OffDbg->insertBefore(Dbg);
} else {
- OffDbg->deleteValue();
+ OffDbg->eraseFromParent();
}
if (RsrcExpr) {
Dbg->setExpression(*RsrcExpr);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index ba66134..e187959 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ public:
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
- bool isLaneMask(Register Reg) {
- const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
- if (RB && RB->getID() == AMDGPU::VCCRegBankID)
- return true;
+ bool isLaneMask(Register Reg);
+ std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
+ std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
+ Register getReadAnyLaneSrc(Register Src);
+ void replaceRegWithOrBuildCopy(Register Dst, Register Src);
- const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
- return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
- }
+ bool tryEliminateReadAnyLane(MachineInstr &Copy);
+ void tryCombineCopy(MachineInstr &MI);
+ void tryCombineS1AnyExt(MachineInstr &MI);
+};
- void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
- MI.eraseFromParent();
- if (Optional0 && isTriviallyDead(*Optional0, MRI))
- Optional0->eraseFromParent();
- }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+ return true;
- std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
- MachineInstr *MatchMI = MRI.getVRegDef(Src);
- if (MatchMI->getOpcode() != Opcode)
- return {nullptr, Register()};
- return {MatchMI, MatchMI->getOperand(1).getReg()};
- }
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
- void tryCombineCopy(MachineInstr &MI) {
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- // Skip copies of physical registers.
- if (!Dst.isVirtual() || !Src.isVirtual())
- return;
-
- // This is a cross bank copy, sgpr S1 to lane mask.
- //
- // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
- // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
- // ->
- // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
- if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
- assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
- B.setInstr(MI);
- // Ensure that truncated bits in BoolSrc are 0.
- auto One = B.buildConstant({SgprRB, S32}, 1);
- auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
- B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+std::pair<MachineInstr *, Register>
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+ MachineInstr *MatchMI = MRI.getVRegDef(Src);
+ if (MatchMI->getOpcode() != Opcode)
+ return {nullptr, Register()};
+ return {MatchMI, MatchMI->getOperand(1).getReg()};
+}
+
+std::pair<GUnmerge *, int>
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+ MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+ if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+ return {nullptr, -1};
+
+ Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+ if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
+ return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
- // Src = G_AMDGPU_READANYLANE RALSrc
- // Dst = COPY Src
- // ->
- // Dst = RALSrc
- if (MRI.getRegBankOrNull(Dst) == VgprRB &&
- MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (!RAL)
- return;
-
- assert(MRI.getRegBank(RALSrc) == VgprRB);
- MRI.replaceRegWith(Dst, RALSrc);
- cleanUpAfterCombine(MI, RAL);
- return;
+ return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+ // Src = G_AMDGPU_READANYLANE RALSrc
+ auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RAL)
+ return RALSrc;
+
+ // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+ // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+ // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+  // Src = G_MERGE_VALUES LoSgpr, HiSgpr
+ auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+ if (Merge) {
+ unsigned NumElts = Merge->getNumSources();
+ auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+ if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+ return {};
+
+    // Check that all elements come from the same unmerge and are not shuffled.
+ for (unsigned i = 1; i < NumElts; ++i) {
+ auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+ if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+ return {};
}
+ return Unmerge->getSourceReg();
}
- void tryCombineS1AnyExt(MachineInstr &MI) {
- // %Src:sgpr(S1) = G_TRUNC %TruncSrc
- // %Dst = G_ANYEXT %Src:sgpr(S1)
- // ->
- // %Dst = G_... %TruncSrc
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (MRI.getType(Src) != S1)
- return;
-
- auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
- if (!Trunc)
- return;
-
- LLT DstTy = MRI.getType(Dst);
- LLT TruncSrcTy = MRI.getType(TruncSrc);
-
- if (DstTy == TruncSrcTy) {
- MRI.replaceRegWith(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc
+  // SourceReg = G_MERGE_VALUES ..., SrcRegIdx, ...
+ // ..., Src, ... = G_UNMERGE_VALUES SourceReg
+ auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+ if (!UnMerge)
+ return {};
+
+ int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+ Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+ if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
+ return {};
+
+ Register SrcRegIdx = Merge->getSourceReg(Idx);
+ if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
+ return {};
+
+ auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RALEl)
+ return RALElSrc;
+
+ return {};
+}
+
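
The three shapes getReadAnyLaneSrc() recognizes, summarized as MIR sketches:

    // 1) %Src = G_AMDGPU_READANYLANE %V                     -> returns %V
    // 2) %Src = G_MERGE_VALUES (RAL %Lo), (RAL %Hi), ...    -> returns the
    //    common G_UNMERGE_VALUES source, if no element is shuffled
    // 3) %Src is one def of a G_UNMERGE_VALUES of a G_MERGE_VALUES whose
    //    matching source is (RAL %V)                        -> returns %V
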
+void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
+ Register Src) {
+ if (Dst.isVirtual())
+ MRI.replaceRegWith(Dst, Src);
+ else
+ B.buildCopy(Dst, Src);
+}
+
+bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
+ MachineInstr &Copy) {
+ Register Dst = Copy.getOperand(0).getReg();
+ Register Src = Copy.getOperand(1).getReg();
+
+ // Skip non-vgpr Dst
+ if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
+ : !TRI.isVGPR(MRI, Dst))
+ return false;
+
+  // Skip physical source registers and source registers with a register class.
+ if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
+ return false;
+
+ Register RALDst = Src;
+ MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+ if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
+ RALDst = SrcMI.getOperand(1).getReg();
+
+ Register RALSrc = getReadAnyLaneSrc(RALDst);
+ if (!RALSrc)
+ return false;
+
+ B.setInstr(Copy);
+ if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+ // Src = READANYLANE RALSrc Src = READANYLANE RALSrc
+ // Dst = Copy Src $Dst = Copy Src
+ // -> ->
+ // Dst = RALSrc $Dst = Copy RALSrc
+ replaceRegWithOrBuildCopy(Dst, RALSrc);
+ } else {
+ // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
+ // Src = G_BITCAST RALDst Src = G_BITCAST RALDst
+ // Dst = Copy Src Dst = Copy Src
+ // -> ->
+ // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst
+ // Dst = NewVgpr $Dst = Copy NewVgpr
+ auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+ replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
+ }
+
+ eraseInstr(Copy, MRI);
+ return true;
+}
+
+void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
+ if (tryEliminateReadAnyLane(MI))
+ return;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ // Skip copies of physical registers.
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return;
+
+ // This is a cross bank copy, sgpr S1 to lane mask.
+ //
+ // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
+ // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
+ // ->
+ // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1
+ // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32)
+ if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
+ auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
+ assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
+ "sgpr S1 must be result of G_TRUNC of sgpr S32");
B.setInstr(MI);
+ // Ensure that truncated bits in BoolSrc are 0.
+ auto One = B.buildConstant({SgprRB, S32}, 1);
+ auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
+ B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
+ eraseInstr(MI, MRI);
+ }
+}
- if (DstTy == S32 && TruncSrcTy == S64) {
- auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
- MRI.replaceRegWith(Dst, Unmerge.getReg(0));
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
+ // %Src:sgpr(S1) = G_TRUNC %TruncSrc
+ // %Dst = G_ANYEXT %Src:sgpr(S1)
+ // ->
+ // %Dst = G_... %TruncSrc
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (MRI.getType(Src) != S1)
+ return;
+
+ auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
+ if (!Trunc)
+ return;
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT TruncSrcTy = MRI.getType(TruncSrc);
+
+ if (DstTy == TruncSrcTy) {
+ MRI.replaceRegWith(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S64 && TruncSrcTy == S32) {
- B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
- {TruncSrc, B.buildUndef({SgprRB, S32})});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ B.setInstr(MI);
- if (DstTy == S32 && TruncSrcTy == S16) {
- B.buildAnyExt(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S32 && TruncSrcTy == S64) {
+ auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
+ MRI.replaceRegWith(Dst, Unmerge.getReg(0));
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S16 && TruncSrcTy == S32) {
- B.buildTrunc(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S64 && TruncSrcTy == S32) {
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
+ {TruncSrc, B.buildUndef({SgprRB, S32})});
+ eraseInstr(MI, MRI);
+ return;
+ }
- llvm_unreachable("missing anyext + trunc combine");
+ if (DstTy == S32 && TruncSrcTy == S16) {
+ B.buildAnyExt(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
}
-};
+
+ if (DstTy == S16 && TruncSrcTy == S32) {
+ B.buildTrunc(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
+
+ llvm_unreachable("missing anyext + trunc combine");
+}
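
The case analysis above, summarized (for %Dst = G_ANYEXT (G_TRUNC %TruncSrc)); this table merely restates the code:

    // DstTy == TruncSrcTy : reuse %TruncSrc directly
    // S32 from S64        : take the low half via G_UNMERGE_VALUES
    // S64 from S32        : G_MERGE_VALUES (%TruncSrc, undef)
    // S32 from S16        : G_ANYEXT %TruncSrc
    // S16 from S32        : G_TRUNC %TruncSrc
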
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 411159c..f471881 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
- MUI(MUI), RBI(RBI), RBLRules(RBLRules),
+ MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}
+bool RegBankLegalizeHelper::executeInWaterfallLoop(
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) {
+ // Track use registers which have already been expanded with a readfirstlane
+ // sequence. This may have multiple uses if moving a sequence.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction &MF = B.getMF();
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
+ if (IsWave32) {
+ MovExecOpc = AMDGPU::S_MOV_B32;
+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+ ExecReg = AMDGPU::EXEC_LO;
+ } else {
+ MovExecOpc = AMDGPU::S_MOV_B64;
+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+ ExecReg = AMDGPU::EXEC;
+ }
+
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
+
+ Register SavedExec = MRI.createVirtualRegister(WaveRC);
+
+  // To insert the loop we need to split the block: everything before this
+  // point stays in the current block, everything from Range.end() onwards
+  // moves to a new remainder block, and the loop blocks go in between.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, BodyBB);
+ MF.insert(MBBI, RestoreExecBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(RestoreExecBB);
+ BodyBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // +-MBB:------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %Dst = MI %Vgpr |
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +-----------------+
+ // ->
+ // +-MBB-------------------------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
+ // +----------------|------------------+
+ // | /------------------------------|
+ // V V |
+ // +-LoopBB---------------------------------------------------------------+ |
+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
+ // | instead of executing for each lane, see if other lanes had | |
+ // | same value for %Vgpr and execute for them also. | |
+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
+ // +----------------|-----------------------------------------------------+ |
+ // V |
+ // +-BodyBB------------------------------------------------------------+ |
+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
+ // | executed only for active lanes and written to Dst | |
+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
+ // | SI_WATERFALL_LOOP LoopBB |-----|
+ // +----------------|--------------------------------------------------+
+ // V
+ // +-RestoreExecBB--------------------------+
+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
+ // +----------------|-----------------------+
+ // V
+ // +-RemainderBB:----------------------+
+ // | %1 = G_INST_2 |
+ // | ... |
+  // +-----------------------------------+
+
+  // Move the instruction into the loop body. Note that everything after
+  // Range.end() has already been moved into a new block, so Range.end() is
+  // no longer valid.
+ BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
+
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
+ auto NewEnd = BodyBB->end();
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
+ B.setMBB(*LoopBB);
+ Register CondReg;
+
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.all_uses()) {
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in
+ // the sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ Register OpReg = Op.getReg();
+ LLT OpTy = MRI.getType(OpReg);
+
+ // TODO: support for agpr
+ assert(MRI.getRegBank(OpReg) == VgprRB);
+ Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
+ buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
+
+ // Build the comparison(s), CurrentLaneReg == OpReg.
+ unsigned OpSize = OpTy.getSizeInBits();
+ unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
+ LLT PartTy = LLT::scalar(PartSize);
+ unsigned NumParts = OpSize / PartSize;
+ SmallVector<Register, 8> OpParts;
+ SmallVector<Register, 8> CurrentLaneParts;
+
+ if (NumParts == 1) {
+ OpParts.push_back(OpReg);
+ CurrentLaneParts.push_back(CurrentLaneReg);
+ } else {
+ auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
+ auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ OpParts.push_back(UnmergeOp.getReg(i));
+ CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
+ }
+ }
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
+ B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
+
+ if (!CondReg)
+ CondReg = CmpReg;
+ else
+ CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
+ }
+
+ Op.setReg(CurrentLaneReg);
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
+ }
+ }
+
+  // Copy vcc to sgpr32/64; the ballot becomes a no-op during instruction
+  // selection.
+ Register CondRegLM =
+ MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
+ B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
+
+ // Update EXEC, save the original EXEC value to SavedExec.
+ B.buildInstr(AndSaveExecOpc)
+ .addDef(SavedExec)
+ .addReg(CondRegLM, RegState::Kill);
+ MRI.setSimpleHint(SavedExec, CondRegLM);
+
+ B.setInsertPt(*BodyBB, BodyBB->end());
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ B.setInsertPt(MBB, MBB.end());
+ B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
+
+ // Restore the EXEC mask after the loop.
+ B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
+ B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
+
+ // Set the insert point after the original instruction, so any new
+ // instructions will be in the remainder.
+ B.setInsertPt(*RemainderBB, RemainderBB->begin());
+
+ return true;
+}
+
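
The control flow just constructed, compressed into pseudo-code (a sketch of the diagram above, not executable):

    // SaveExec = exec;
    // do {                                          // LoopBB
    //   CurrentLane = readfirstlane(Vgpr);
    //   active      = ballot(CurrentLane == Vgpr);  // lanes sharing the value
    //   SavedExec   = exec & active; exec = SavedExec;
    //   MI(CurrentLane);                            // BodyBB, uniform operand
    //   exec ^= SavedExec;                          // retire finished lanes
    // } while (exec != 0);                          // SI_WATERFALL_LOOP
    // exec = SaveExec;                              // RestoreExecBB
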
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
@@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
- return;
+ break;
case VccExtToSel:
return lowerVccExtToSel(MI);
case UniExtToSel: {
@@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
}
- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
+ if (!WaterfallSgprs.empty()) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ }
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Vgpr16:
return LLT::scalar(16);
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
+ case SgprV4S32_WF:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
@@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return VccRB;
case Sgpr16:
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr64:
case Sgpr128:
case SgprP1:
@@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
+ case SgprV4S32_WF:
case SgprB32:
case SgprB64:
case SgprB96:
@@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
+ // sgpr waterfall, scalars and vectors
+ case Sgpr32_WF:
+ case SgprV4S32_WF: {
+ assert(Ty == getTyFromID(MethodIDs[i]));
+ if (RB != SgprRB)
+ SgprWaterfallOperandRegs.insert(Reg);
+ break;
+ }
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 08cc7d4..db965d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -32,6 +32,7 @@ class RegBankLegalizeHelper {
const MachineUniformityInfo &MUI;
const RegisterBankInfo &RBI;
const RegBankLegalizeRules &RBLRules;
+ const bool IsWave32;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a60855c..5a6ad40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+ .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
addRulesForGOpcs({G_FCMP})
.Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
@@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format off
addRulesForGOpcs({G_LOAD})
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
+ .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
+ .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
+ .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
+ .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
@@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
+ .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
.Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
.Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
.Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
@@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
// clang-format on
- addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
- .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
addRulesForGOpcs({G_STORE})
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
@@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_PTR_ADD})
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
+ .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
+ .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 7243d75..1391440 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID {
Sgpr32Trunc,
- // Src only modifiers: waterfalls, extends
+ // Src only modifiers: execute in waterfall loop if divergent
+ Sgpr32_WF,
+ SgprV4S32_WF,
+
+ // Src only modifiers: extends
Sgpr32AExt,
Sgpr32AExtBoolInReg,
Sgpr32SExt,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bf2f37b..c5a1d9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Special case for s_mul_u64. There is not a vector equivalent of
// s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
// multiplications.
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
+ DstTy.getSizeInBits() == 64) {
applyMappingSMULU64(B, OpdMapper);
return;
}
@@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingMAD_64_32(B, OpdMapper);
return;
case AMDGPU::G_PREFETCH: {
- if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
+ if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
MI.eraseFromParent();
return;
}
Register PtrReg = MI.getOperand(0).getReg();
unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
- if (PtrBank == AMDGPU::VGPRRegBankID) {
+ if (PtrBank == AMDGPU::VGPRRegBankID &&
+ (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
+ // Cannot do I$ prefetch with divergent pointer.
MI.eraseFromParent();
return;
}
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ (!Subtarget.hasSafeSmemPrefetch() &&
+ (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ !MI.getOperand(3).getImm() /* I$ prefetch */))) {
MI.eraseFromParent();
return;
}
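
Restating the erase conditions above as one predicate (a sketch; operand 3 of G_PREFETCH is read as data-vs-instruction prefetch, nonzero meaning data):

    // bool IsData    = MI.getOperand(3).getImm() != 0;
    // bool Divergent = PtrBank == AMDGPU::VGPRRegBankID;
    // erase if   no SMEM- and no VMEM-prefetch support
    //       ||  (Divergent && (!hasVmemPrefInsts || !IsData)) // I$ needs a uniform ptr
    //       ||  AS is not flat/global and not CONSTANT_ADDRESS_32BIT
    //       ||  (!hasSafeSmemPrefetch && (AS == CONSTANT_ADDRESS_32BIT || !IsData))
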
@@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
- OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ else
+ OpdsMapping[0] =
+ getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
@@ -4714,6 +4724,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
@@ -5169,6 +5180,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_ds_load_tr16_b128:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr6_b96:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -5431,6 +5448,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
@@ -5540,6 +5560,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
break;
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8e1967..f580f43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// If the inputs are tied and the same register, we can shortcut and
// directly replace the register.
- if (Src2->getReg() != CopySrcReg) {
+ if (!Src2->isReg() || Src2->getReg() != CopySrcReg ||
+ Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(
dbgs()
<< "Replacing untied VGPR MFMAs with AGPR form not yet handled\n");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1e44be8..6878744 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -61,6 +61,7 @@ protected:
bool EnableRealTrue16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
+ bool HasBF16PackedInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
@@ -209,6 +210,8 @@ public:
return HasBF16ConversionInsts;
}
+ bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index de17fcc..421fc42 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -176,6 +176,8 @@ public:
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
ImmTyBitOp3,
+ ImmTyMatrixAFMT,
+ ImmTyMatrixBFMT,
ImmTyMatrixAReuse,
ImmTyMatrixBReuse,
ImmTyByteSel,
@@ -423,6 +425,8 @@ public:
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
+ bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); }
+ bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); }
bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
@@ -1174,6 +1178,8 @@ public:
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
case ImmTyBitOp3: OS << "BitOp3"; break;
+ case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break;
+ case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
case ImmTyByteSel: OS << "ByteSel" ; break;
@@ -1714,6 +1720,10 @@ public:
ParseStatus parseIndexKey8bit(OperandVector &Operands);
ParseStatus parseIndexKey16bit(OperandVector &Operands);
ParseStatus parseIndexKey32bit(OperandVector &Operands);
+ ParseStatus tryParseMatrixFMT(OperandVector &Operands, StringRef Name,
+ AMDGPUOperand::ImmTy Type);
+ ParseStatus parseMatrixAFMT(OperandVector &Operands);
+ ParseStatus parseMatrixBFMT(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
@@ -1849,6 +1859,7 @@ private:
const unsigned CPol);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
+ bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -5128,13 +5139,45 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
auto FB = getFeatureBits();
+ if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts])
+ return true;
+
unsigned Opc = Inst.getOpcode();
+ const MCRegisterInfo *MRI = getMRI();
// DS_READ_B96_TR_B6 is the only DS instruction in GFX950 that allows an
// unaligned VGPR. All others only allow even aligned VGPRs.
- if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi)
+ if (FB[AMDGPU::FeatureGFX90AInsts] && Opc == AMDGPU::DS_READ_B96_TR_B6_vi)
return true;
- const MCRegisterInfo *MRI = getMRI();
+ if (FB[AMDGPU::FeatureGFX1250Insts]) {
+ switch (Opc) {
+ default:
+ break;
+ case AMDGPU::DS_LOAD_TR6_B96:
+ case AMDGPU::DS_LOAD_TR6_B96_gfx12:
+ // DS_LOAD_TR6_B96 is the only DS instruction in GFX1250 that
+ // allows an unaligned VGPR. All others only allow even aligned VGPRs.
+ return true;
+ case AMDGPU::GLOBAL_LOAD_TR6_B96:
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_gfx1250: {
+ // GLOBAL_LOAD_TR6_B96 is the only GLOBAL instruction in GFX1250 that
+ // allows an unaligned VGPR for vdst, but other operands still only allow
+ // even aligned VGPRs.
+ int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+ if (VAddrIdx != -1) {
+ const MCOperand &Op = Inst.getOperand(VAddrIdx);
+ MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ if ((Sub - AMDGPU::VGPR0) & 1)
+ return false;
+ }
+ return true;
+ }
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR:
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR_gfx1250:
+ return true;
+ }
+ }
+
const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
@@ -5281,6 +5324,12 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
if (!isGFX1250()) {
+ if (CPol & CPol::SCAL) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+ Error(S, "scale_offset is not supported on this GPU");
+ }
if (CPol & CPol::NV) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
@@ -5289,6 +5338,13 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
}
}
+ if ((CPol & CPol::SCAL) && !supportsScaleOffset(MII, Inst.getOpcode())) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+ Error(S, "scale_offset is not supported for this instruction");
+ }
+
if (isGFX12Plus())
return validateTHAndScopeBits(Inst, Operands, CPol);
@@ -5409,6 +5465,37 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
+ const OperandVector &Operands) {
+ unsigned Opc = Inst.getOpcode();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ auto validateFmt = [&](AMDGPU::OpName FmtOp, AMDGPU::OpName SrcOp) -> bool {
+ int FmtIdx = AMDGPU::getNamedOperandIdx(Opc, FmtOp);
+ if (FmtIdx == -1)
+ return true;
+ unsigned Fmt = Inst.getOperand(FmtIdx).getImm();
+ int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp);
+ unsigned RegSize =
+ TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits();
+
+ if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32)
+ return true;
+
+ static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
+ "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
+ "MATRIX_FMT_FP4"};
+
+ Error(getRegLoc(mc2PseudoReg(Inst.getOperand(SrcIdx).getReg()), Operands),
+ "wrong register tuple size for " + Twine(FmtNames[Fmt]));
+ return false;
+ };
+
+ return validateFmt(AMDGPU::OpName::matrix_a_fmt, AMDGPU::OpName::src0) &&
+ validateFmt(AMDGPU::OpName::matrix_b_fmt, AMDGPU::OpName::src1);
+}
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
@@ -5542,6 +5629,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateTFE(Inst, Operands)) {
return false;
}
+ if (!validateWMMA(Inst, Operands)) {
+ return false;
+ }
return true;
}
@@ -6926,6 +7016,7 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
ParseStatus ResTH = ParseStatus::NoMatch;
ParseStatus ResScope = ParseStatus::NoMatch;
ParseStatus ResNV = ParseStatus::NoMatch;
+ ParseStatus ResScal = ParseStatus::NoMatch;
for (;;) {
if (ResTH.isNoMatch()) {
@@ -6964,10 +7055,22 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
}
}
+ if (ResScal.isNoMatch()) {
+ if (trySkipId("scale_offset")) {
+ ResScal = ParseStatus::Success;
+ CPolVal |= CPol::SCAL;
+ continue;
+ } else if (trySkipId("no", "scale_offset")) {
+ ResScal = ParseStatus::Success;
+ continue;
+ }
+ }
+
break;
}
- if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch())
+ if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch() &&
+ ResScal.isNoMatch())
return ParseStatus::NoMatch;
Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc,
@@ -7215,6 +7318,26 @@ ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit);
}
+ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands,
+ StringRef Name,
+ AMDGPUOperand::ImmTy Type) {
+ return parseStringOrIntWithPrefix(Operands, Name,
+ {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8",
+ "MATRIX_FMT_FP6", "MATRIX_FMT_BF6",
+ "MATRIX_FMT_FP4"},
+ Type);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixAFMT(OperandVector &Operands) {
+ return tryParseMatrixFMT(Operands, "matrix_a_fmt",
+ AMDGPUOperand::ImmTyMatrixAFMT);
+}
+
+ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) {
+ return tryParseMatrixFMT(Operands, "matrix_b_fmt",
+ AMDGPUOperand::ImmTyMatrixBFMT);
+}
+
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -9316,6 +9439,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
DefaultVal);
}
+ int MatrixAFMTIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_fmt);
+ if (MatrixAFMTIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixAFMT, 0);
+ }
+
+ int MatrixBFMTIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_fmt);
+ if (MatrixBFMTIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixBFMT, 0);
+ }
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyMatrixAReuse, 0);
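For reference, the tuple-size rule that validateWMMA enforces earlier in this hunk reduces to a single comparison: a matrix format that occupies N 32-bit registers requires a source register tuple of exactly N * 32 bits. A sketch under that assumption (the NumRegsForFmt value is an illustrative input, not the real table behind wmmaScaleF8F6F4FormatToNumRegs):

#include <cstdio>

// True when the source tuple width matches what the matrix format needs.
bool tupleSizeMatchesFormat(unsigned RegSizeInBits, unsigned NumRegsForFmt) {
  return RegSizeInBits == NumRegsForFmt * 32;
}

int main() {
  std::printf("%d\n", tupleSizeMatchesFormat(384, 12)); // 1: 12 VGPRs = 384 bits
  std::printf("%d\n", tupleSizeMatchesFormat(512, 12)); // 0: wrong tuple size
}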
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index e994aee..f99e716 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1488,7 +1488,6 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e219fe0..319cc9d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -886,7 +886,6 @@ defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_t16 <DS_READ_I8, i16, "sextloadi8_local">;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 98f7e17..5c1989b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -877,6 +877,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI)
convertMAIInst(MI);
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsWMMA)
+ convertWMMAInst(MI);
+
int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst_in);
if (VDstIn_Idx != -1) {
@@ -974,10 +977,23 @@ static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI,
return MO.setReg(
MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5));
case 8:
+ if (MCRegister NewReg = MRI.getSubReg(
+ MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7)) {
+ MO.setReg(NewReg);
+ }
+ return;
+ case 12: {
+ // There is no 384-bit subreg index defined.
+ MCRegister BaseReg = MRI.getSubReg(MO.getReg(), AMDGPU::sub0);
+ MCRegister NewReg = MRI.getMatchingSuperReg(
+ BaseReg, AMDGPU::sub0, &MRI.getRegClass(AMDGPU::VReg_384RegClassID));
+ return MO.setReg(NewReg);
+ }
+ case 16:
// No-op in cases where one operand is still f8/bf8.
return;
default:
- llvm_unreachable("Unexpected size for mfma f8f6f4 operand");
+ llvm_unreachable("Unexpected size for mfma/wmma f8f6f4 operand");
}
}
@@ -1015,6 +1031,35 @@ void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const {
AdjustedRegClassOpcode->NumRegsSrcB);
}
+void AMDGPUDisassembler::convertWMMAInst(MCInst &MI) const {
+ int FmtAIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_a_fmt);
+ if (FmtAIdx == -1)
+ return;
+
+ int FmtBIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::matrix_b_fmt);
+
+ unsigned FmtA = MI.getOperand(FmtAIdx).getImm();
+ unsigned FmtB = MI.getOperand(FmtBIdx).getImm();
+
+ const AMDGPU::MFMA_F8F6F4_Info *AdjustedRegClassOpcode =
+ AMDGPU::getWMMA_F8F6F4_WithFormatArgs(FmtA, FmtB, MI.getOpcode());
+ if (!AdjustedRegClassOpcode ||
+ AdjustedRegClassOpcode->Opcode == MI.getOpcode())
+ return;
+
+ MI.setOpcode(AdjustedRegClassOpcode->Opcode);
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ int Src1Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+ adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src0Idx),
+ AdjustedRegClassOpcode->NumRegsSrcA);
+ adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src1Idx),
+ AdjustedRegClassOpcode->NumRegsSrcB);
+}
+
struct VOPModifiers {
unsigned OpSel = 0;
unsigned OpSelHi = 0;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 8404100..f4d164b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -161,6 +161,7 @@ public:
void convertFMAanyK(MCInst &MI) const;
void convertSDWAInst(MCInst &MI) const;
void convertMAIInst(MCInst &MI) const;
+ void convertWMMAInst(MCInst &MI) const;
void convertDPP8Inst(MCInst &MI) const;
void convertMIMGInst(MCInst &MI) const;
void convertVOP3DPPInst(MCInst &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f7f29f1..7207c25 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -13,8 +13,9 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
+ def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
- def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
+ def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
class True16D16Table <string hiOp, string loOp> {
@@ -464,6 +465,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> :
+ FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
+ let has_vdst = 0;
+ let has_data = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let VM_CNT = 0;
+ let LGKM_CNT = 0;
+}
+
+multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
+ def "" : FLAT_Prefetch_Pseudo<opName>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let enabled_saddr = 1;
+ }
+}
+
+multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
+ let is_flat_global = 1, has_saddr = 1 in {
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let enabled_saddr = 1;
+ }
+ }
+}
+
class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
@@ -1162,6 +1194,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
+let SubtargetPredicate = isGFX125xOnly in {
+defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>;
+defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>;
+defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>;
+defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
@@ -1218,6 +1260,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">;
+ defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">;
+}
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -1228,6 +1275,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset)
>;
+class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1249,8 +1301,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, 0, $in)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1264,8 +1316,8 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
>;
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, (i32 0))
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1278,6 +1330,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))),
+ (inst $saddr, $voffset, $offset, $cpol)
+>;
+
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
@@ -1443,24 +1505,24 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
>;
class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
- (inst $vaddr, $saddr, $offset, 0)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (inst $vaddr, $saddr, $offset, $cpol)
>;
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)),
- (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset)
+ (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
+ (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol)
>;
class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)),
- (inst $vaddr, $saddr, $offset, 0, $in)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $vaddr, $saddr, $offset, $cpol, $in)
>;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
- (inst $vaddr, $saddr, $offset, 0)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (inst $vaddr, $saddr, $offset, $cpol)
>;
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
@@ -1473,6 +1535,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
+multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_CPOL<inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatSignedLoadPat_D16 <inst, node, vt> {
let AddedComplexity = 10;
@@ -2009,6 +2081,16 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts]
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
}
+let OtherPredicates = [isGFX125xOnly] in {
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>;
+
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
+} // End OtherPredicates = [isGFX125xOnly]
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2138,6 +2220,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+def PrefetchLoc: SDNodeXForm<timm, [{
+ uint32_t V = N->getZExtValue();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+ }];
+}
+
+def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch()); }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch());
+ }];
+}
+
+multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
+ def : GCNPat <
+ (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
+ }
+
+ def : GCNPat <
+ (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
+ }
+}
+
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
+
+ // Patterns for forced vector prefetch with rw = 1.
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
+
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -2941,6 +3094,7 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni
let DecoderNamespace = "GFX12";
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+ let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
}
}
@@ -3170,6 +3324,7 @@ multiclass VFLAT_Real_gfx1250<bits<8> op,
let DecoderNamespace = "GFX1250";
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+ let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
}
}
@@ -3208,6 +3363,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME
defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>;
defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>;
+defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+
+defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
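The PrefetchLoc SDNodeXForm above is worth unpacking: the prefetch intrinsic's locality operand (0 = no temporal locality .. 3 = keep as close as possible) is inverted into a cache-policy scope (0 = CU .. 3 = SYS), then clamped away from CU scope on targets where CU-level prefetch is unsafe. A standalone sketch of that arithmetic, with constants mirroring the SCOPE_* values added to SIDefines.h in this patch:

#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr uint32_t ScopeShift = 3, ScopeMask = 0x3;
constexpr uint32_t ScopeSE = 1u << ScopeShift; // SCOPE_SE

uint32_t prefetchLocToCPol(uint32_t Loc, bool HasSafeCUPrefetch) {
  uint32_t V = (ScopeMask - (Loc & ScopeMask)) << ScopeShift;
  if (!HasSafeCUPrefetch)
    V = std::max(V, ScopeSE); // CU scope is unsafe, widen to SE
  return V;
}

int main() {
  std::printf("%u\n", prefetchLocToCPol(3, false)); // 8: SCOPE_SE, not CU
  std::printf("%u\n", prefetchLocToCPol(0, true));  // 24: SCOPE_SYS
}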
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bbed828..94886b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
const MachineInstr *MI, IsExpiredFn IsExpired) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
- std::next(MI->getReverseIterator()),
- 0, IsExpired, Visited);
+ std::next(MI->getReverseIterator()), 0, IsExpired,
+ Visited, SIInstrInfo::getNumWaitStates);
}
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
@@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
- fixWMMAHazards(MI);
+ fixWMMAHazards(MI); // Falls through if co-execution is enabled.
+ fixWMMACoexecutionHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
return true;
}
+static bool isCoexecutableVALUInst(const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
+ !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
+}
+
+static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
+ const SIInstrInfo *TII, unsigned Latency,
+ unsigned Category) {
+ assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
+ "Handle me if the xdl wmma instruction latency changes");
+
+ switch (Category) {
+ case 0: // Dense WMMA Instructions:
+ // WMMA_*F16, WMMA_*BF16
+ // WMMA_*FP8FP8
+ // WMMA_*FP8BF8
+ // WMMA_*BF8FP8
+ // WMMA_*BF8BF8
+ // WMMA_*F8F6F4 if SRCA & SRCB != F8
+ return Latency == 8 && SIInstrInfo::isWMMA(MI);
+
+ case 1: // Dense WMMA Instructions:
+ // WMMA_IU8
+ // WMMA_IU4
+ // WMMA_*F8F6F4 if SRCA OR SRCB == F8
+ return Latency == 16 && SIInstrInfo::isWMMA(MI);
+
+ case 2: // Dense SWMMAC Instructions
+ // SWMMAC_*F16, SWMMAC_*BF16,
+ // SWMMAC_*FP8FP8
+ // SWMMAC_*BF8FP8
+ // SWMMAC_*FP8BF8
+ // SWMMAC_*BF8BF8
+ return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
+
+ case 3: // Sparse WMMA Instructions:
+ // SWMMAC_IU8
+ // SWMMAC_IU4
+ return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
+ default:
+ break;
+ } // end switch.
+
+ return false;
+}
+
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+ if (!AMDGPU::isGFX1250(ST))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
+ // must be in between the first WMMA and the second instruction to cover the
+ // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
+ // second is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data
+ // hazards" for the numbers, which depend on the category of the first WMMA.
+ const int WMMAWaitStates[] = {5, 9, 3, 5};
+ const int VALUWaitStates[] = {4, 8, 2, 4};
+ unsigned Category = 0;
+
+ auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
+ Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
+
+ // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+ if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(*MI)) {
+ Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D0, Idx1))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ // WMMA writes, VALU reads.
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+ if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+ Register D1 = ValuDst->getReg();
+
+ // WMMA writes, VALU writes.
+ if (TRI->regsOverlap(D0, D1))
+ return true;
+
+ // WMMA reads, VALU writes.
+ Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
+ Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
+ if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(I)) {
+ Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D1, Idx0))
+ return true;
+ }
+
+ return false;
+ };
+
+ int Limit = 0;
+ auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+
+ auto GetWaitStatesFn = [](const MachineInstr &I) {
+ return SIInstrInfo::isVALU(I) ? 1 : 0;
+ };
+
+ int WaitStatesNeeded = -1;
+ if (TII->isXDLWMMA(*MI)) {
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+ // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+ // exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // for other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ } else { // Must be a co-executable VALU.
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = VALUWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+ // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+ // exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // for other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ }
+
+ // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative
+ // means not needed.
+ for (int i = 0; i < WaitStatesNeeded; i++)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
@@ -3206,7 +3383,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
auto NextMI = std::next(It);
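To make the wait-state bookkeeping in fixWMMACoexecutionHazards concrete: getWaitStatesSince counts one unit per VALU between the hazardous WMMA and MI (via GetWaitStatesFn) and reports "no hazard" as a huge value, so Limit minus that count is the number of V_NOPs still owed, and anything negative means the gap is already wide enough. A toy sketch of just the arithmetic, with illustrative numbers:

#include <algorithm>
#include <climits>
#include <cstdio>

int nopsNeeded(int Limit, int ValusSinceHazard /* INT_MAX if no hazard */) {
  return Limit - ValusSinceHazard; // negative means nothing to insert
}

int main() {
  const int WMMAWaitStates[] = {5, 9, 3, 5}; // per hazard category, from above
  std::printf("%d\n", std::max(0, nopsNeeded(WMMAWaitStates[1], 6)));       // 3
  std::printf("%d\n", std::max(0, nopsNeeded(WMMAWaitStates[1], INT_MAX))); // 0
}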
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index ef6ddd8..f796eeae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -106,6 +106,7 @@ private:
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
+ bool fixWMMACoexecutionHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7b8f0f4..9a2bab1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -324,7 +324,7 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
}
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// Track register pressure so the scheduler can try to decrease
// pressure once register usage is above the threshold defined by
// SIRegisterInfo::getRegPressureSetLimit()
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 268162b..88a269f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -123,6 +123,7 @@ protected:
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
+ bool HasFmaMixBF16Insts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
@@ -244,7 +245,9 @@ protected:
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
+ bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
+ bool HasSafeCUPrefetch = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -265,8 +268,10 @@ protected:
bool HasIEEEMinimumMaximumInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
+ bool HasMin3Max3PKF16 = false;
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
+ bool HasAddSubU64Insts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -460,6 +465,8 @@ public:
return HasFmaMixInsts;
}
+ bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+
bool hasCARRY() const {
return true;
}
@@ -985,8 +992,12 @@ public:
bool hasPrefetch() const { return GFX12Insts; }
+ bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1022,7 +1033,7 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
void mirFileLoaded(MachineFunction &MF) const override;
@@ -1162,8 +1173,14 @@ public:
bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ // Scalar and global loads support the scale_offset bit.
+ bool hasScaleOffset() const { return GFX1250Insts; }
+
bool hasFlatGVSMode() const { return FlatGVSMode; }
+ // FLAT GLOBAL VOffset is signed
+ bool hasSignedGVSOffset() const { return GFX1250Insts; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -1300,7 +1317,7 @@ public:
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
- bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+ bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
@@ -1381,6 +1398,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
+
bool hasTanhInsts() const { return HasTanhInsts; }
bool hasAddPC64Inst() const { return GFX1250Insts; }
@@ -1494,6 +1513,18 @@ public:
bool hasVOPD3() const { return GFX1250Insts; }
+ // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+ bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+
+ // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+ bool hasVectorMulU64() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
+ bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
+ bool hasPkMinMax3Insts() const { return GFX1250Insts; }
+
// \returns true if target has S_SETPRIO_INC_WG instruction.
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 44d2f94..11b072e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -157,6 +157,9 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
const int64_t TH = Imm & CPol::TH;
const int64_t Scope = Imm & CPol::SCOPE;
+ if (Imm & CPol::SCAL)
+ O << " scale_offset";
+
printTH(MI, TH, Scope, O);
printScope(Scope, O);
@@ -1345,6 +1348,48 @@ void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo,
O << " index_key:" << Imm;
}
+void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, char AorB) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
+ if (Imm == 0)
+ return;
+
+ O << " matrix_" << AorB << "_fmt:";
+ switch (Imm) {
+ default:
+ O << Imm;
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_FP8:
+ O << "MATRIX_FMT_FP8";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_BF8:
+ O << "MATRIX_FMT_BF8";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_FP6:
+ O << "MATRIX_FMT_FP6";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_BF6:
+ O << "MATRIX_FMT_BF6";
+ break;
+ case WMMA::MatrixFMT::MATRIX_FMT_FP4:
+ O << "MATRIX_FMT_FP4";
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixFMT(MI, OpNo, STI, O, 'a');
+}
+
+void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printMatrixFMT(MI, OpNo, STI, O, 'b');
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e3299a6..e0b7aa5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -134,6 +134,12 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey32bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O, char AorB);
+ void printMatrixAFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixBFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index f48739f..c49ad79 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -384,6 +384,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
if (((Desc.TSFlags & SIInstrFlags::VOP3P) ||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
+ // Matrix B format operand reuses op_sel_hi.
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
// Matrix B reuse operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 429ce0e0..a33dbfa 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
}
}
+ finalizeBundles(MF);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 2a3b42e..eff5b0a 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() {
void R600PassConfig::addPreEmitPass() {
addPass(createR600MachineCFGStructurizerPass());
addPass(createR600ExpandSpecialInstrsPass());
- addPass(&FinalizeMachineBundlesID);
addPass(createR600Packetizer());
addPass(createR600ControlFlowFinalizer());
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index edc74605..40b8bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -392,16 +392,20 @@ enum CPol {
TH_ATOMIC_CASCADE = 4, // Cascading vs regular
// Scope
- SCOPE = 0x3 << 3, // All Scope bits
- SCOPE_CU = 0 << 3,
- SCOPE_SE = 1 << 3,
- SCOPE_DEV = 2 << 3,
- SCOPE_SYS = 3 << 3,
+ SCOPE_SHIFT = 3,
+ SCOPE_MASK = 0x3,
+ SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
+ SCOPE_CU = 0 << SCOPE_SHIFT,
+ SCOPE_SE = 1 << SCOPE_SHIFT,
+ SCOPE_DEV = 2 << SCOPE_SHIFT,
+ SCOPE_SYS = 3 << SCOPE_SHIFT,
NV = 1 << 5, // Non-volatile bit
SWZ = 1 << 6, // Swizzle bit
+ SCAL = 1 << 11, // Scale offset bit
+
ALL = TH | SCOPE,
// Helper bits
@@ -1005,6 +1009,16 @@ enum Target : unsigned {
} // namespace Exp
+namespace WMMA {
+enum MatrixFMT : unsigned {
+ MATRIX_FMT_FP8 = 0,
+ MATRIX_FMT_BF8 = 1,
+ MATRIX_FMT_FP6 = 2,
+ MATRIX_FMT_BF6 = 3,
+ MATRIX_FMT_FP4 = 4
+};
+} // namespace WMMA
+
namespace VOP3PEncoding {
enum OpSel : uint64_t {
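The revised CPol encoding above keeps the two scope bits at [4:3] and parks the new scale_offset (SCAL) flag at bit 11, clear of TH, scope, NV, and SWZ. A small sketch of packing and unpacking those fields, using the same constants:

#include <cstdint>
#include <cstdio>

constexpr uint32_t SCOPE_SHIFT = 3, SCOPE_MASK = 0x3;
constexpr uint32_t SCOPE = SCOPE_MASK << SCOPE_SHIFT;
constexpr uint32_t SCAL = 1u << 11;

int main() {
  uint32_t CPol = (2u << SCOPE_SHIFT) | SCAL; // SCOPE_DEV plus scale_offset
  std::printf("scope=%u scal=%d\n", (CPol & SCOPE) >> SCOPE_SHIFT,
              (CPol & SCAL) != 0); // scope=2 scal=1
}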
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e172c0b..e5d1eaa 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1209,18 +1209,24 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
- // A frame index will resolve to a positive constant, so it should always be
- // safe to fold the addressing mode, even pre-GFX9.
- UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
-
const unsigned Opc = UseMI->getOpcode();
if (TII->isFLATScratch(*UseMI) &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
+ unsigned CPol =
+ TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
+ if ((CPol & AMDGPU::CPol::SCAL) &&
+ !AMDGPU::supportsScaleOffset(*TII, NewOpc))
+ return;
+
UseMI->setDesc(TII->get(NewOpc));
}
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
+
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6a38679..11552b3 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
- ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+ if (FuncInfo->isWholeWaveFunction()) {
+ // Whole wave functions already have a copy of the original EXEC mask that
+ // we can use.
+ assert(IsProlog && "Epilog should look at return, not setup");
+ ScratchExecCopy =
+ TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
+ assert(ScratchExecCopy && "Couldn't find copy of EXEC");
+ } else {
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+ }
+
if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");
@@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores(
};
StoreWWMRegisters(WWMScratchRegs);
+
+ auto EnableAllLanes = [&]() {
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ };
+
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ EnableAllLanes();
} else {
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
@@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores(
}
StoreWWMRegisters(WWMCalleeSavedRegs);
- if (ScratchExecCopy) {
+ if (FuncInfo->isWholeWaveFunction()) {
+ // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
+ // it now. If we have already saved some WWM CSR registers, then the EXEC is
+ // already -1 and we don't need to do anything else. Otherwise, set EXEC to
+ // -1 here.
+ if (!ScratchExecCopy)
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
+ /*EnableInactiveLanes*/ true);
+ else if (WWMCalleeSavedRegs.empty())
+ EnableAllLanes();
+ TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+ } else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
@@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores(
Register ScratchExecCopy;
SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
- if (!WWMScratchRegs.empty())
- ScratchExecCopy =
- buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
- /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
-
auto RestoreWWMRegisters =
[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
for (const auto &Reg : WWMRegs) {
@@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores(
}
};
+ if (FuncInfo->isWholeWaveFunction()) {
+ // For whole wave functions, the EXEC is already -1 at this point.
+ // Therefore, we can restore the CSR WWM registers right away.
+ RestoreWWMRegisters(WWMCalleeSavedRegs);
+
+ // The original EXEC is the first operand of the return instruction.
+ const MachineInstr &Return = MBB.instr_back();
+ assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
+ "Unexpected return inst");
+ Register OrigExec = Return.getOperand(0).getReg();
+
+ if (!WWMScratchRegs.empty()) {
+ unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
+ .addReg(OrigExec)
+ .addImm(-1);
+ RestoreWWMRegisters(WWMScratchRegs);
+ }
+
+ // Restore original EXEC.
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+ return;
+ }
+
+ if (!WWMScratchRegs.empty()) {
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
+ /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
+ }
RestoreWWMRegisters(WWMScratchRegs);
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
@@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
(MFI->isChainFunction() &&
TII->isChainCallOpcode(MI.getOpcode()))) {
// We expect all return to be the same size.
@@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isEntryFunction())
return;
+ if (MFI->isWholeWaveFunction()) {
+ // In practice, all the VGPRs are WWM registers, and we will need to save at
+ // least their inactive lanes. Add them to WWMReservedRegs.
+ assert(!NeedExecCopyReservedReg &&
+ "Whole wave functions can use the reg mapped for their i1 argument");
+
+ // FIXME: Be more efficient!
+ for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
+ if (MF.getRegInfo().isPhysRegModified(Reg)) {
+ MFI->reserveWWMRegister(Reg);
+ MF.begin()->addLiveIn(Reg);
+ }
+ MF.begin()->sortUniqueLiveIns();
+ }
+
// Remove any VGPRs used in the return value because these do not need to be saved.
// This prevents CSR restore from clobbering return VGPRs.
if (ReturnMI) {
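The EXEC choreography in the whole-wave epilog above is easiest to see as plain bit math: with EXEC known to be all-ones, XORing the saved entry mask with -1 selects exactly the lanes that were inactive at entry (so their scratch WWM registers can be restored), and a final move puts the entry mask back. A bitmask sketch with a 64-bit stand-in for EXEC:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t OrigExec = 0x00000000FFFFFFFFull; // lanes active at function entry
  // s_xor exec, OrigExec, -1: switch to exactly the inactive lanes.
  uint64_t Exec = OrigExec ^ ~0ull;
  std::printf("inactive: %016llx\n", (unsigned long long)Exec);
  // s_mov exec, OrigExec: leave with the caller's lane mask restored.
  Exec = OrigExec;
  std::printf("restored: %016llx\n", (unsigned long long)Exec);
}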
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c76ff2..0eee7ad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FSIN, ISD::FROUND},
MVT::f16, Custom);
+ // BF16 - VOP1 Actions.
+ if (Subtarget->hasBF16TransInsts())
+ setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
+
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
@@ -870,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasScalarSMulU64())
+ if (Subtarget->hasVectorMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (Subtarget->hasScalarSMulU64())
setOperationAction(ISD::MUL, MVT::i64, Custom);
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
+ if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -940,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16PackedInsts()) {
+ setOperationAction(
+ {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
+ MVT::v2bf16, Legal);
+ }
+
if (Subtarget->hasBF16TransInsts()) {
setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
}
@@ -1049,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
// where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
- return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
- (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 &&
- SrcVT.getScalarType() == MVT::f16 &&
+ return DestVT.getScalarType() == MVT::f32 &&
+ ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ SrcVT.getScalarType() == MVT::f16) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
+ SrcVT.getScalarType() == MVT::bf16)) &&
// TODO: This probably only requires no input flushing?
denormalModeIsFlushAllF32(DAG.getMachineFunction());
}
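
Note: the widened condition is easier to audit decomposed; an equivalent
sketch using the same subtarget hooks (helper name is illustrative, not part
of the patch):

  static bool isFPExtFoldableSketch(unsigned Opcode, EVT DestVT, EVT SrcVT,
                                    const GCNSubtarget &ST, bool FlushF32) {
    if (DestVT.getScalarType() != MVT::f32)
      return false;
    // f16 sources fold into v_mad_mix / v_fma_mix.
    bool F16Mix = SrcVT.getScalarType() == MVT::f16 &&
                  ((Opcode == ISD::FMAD && ST.hasMadMixInsts()) ||
                   (Opcode == ISD::FMA && ST.hasFmaMixInsts()));
    // bf16 sources fold only into the new FMA mix variants.
    bool BF16Mix = SrcVT.getScalarType() == MVT::bf16 &&
                   Opcode == ISD::FMA && ST.hasFmaMixBF16Insts();
    return (F16Mix || BF16Mix) && FlushF32;
  }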
@@ -1463,6 +1477,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1536,7 +1556,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
@@ -1587,10 +1609,16 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
@@ -2260,7 +2288,8 @@ SDValue SITargetLowering::getPreloadedValue(
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
if (Subtarget->hasArchitectedSGPRs() &&
- (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+ (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave)) {
switch (PVID) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Reg = &WorkGroupIDX;
@@ -2942,12 +2971,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!Subtarget->enableFlatScratch())
assert(!UserSGPRInfo.hasFlatScratchInit());
if ((CallConv != CallingConv::AMDGPU_CS &&
- CallConv != CallingConv::AMDGPU_Gfx) ||
+ CallConv != CallingConv::AMDGPU_Gfx &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
!Subtarget->hasArchitectedSGPRs())
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ());
}
+ bool IsWholeWaveFunc = Info->isWholeWaveFunction();
+
if (CallConv == CallingConv::AMDGPU_PS) {
processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -2988,7 +3020,8 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- Splits.append(Ins.begin(), Ins.end());
+ Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
+ Ins.end());
}
if (IsKernel)
@@ -3019,6 +3052,13 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
+ if (IsWholeWaveFunc) {
+ SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
+ {MVT::i1, MVT::Other}, Chain);
+ InVals.push_back(Setup.getValue(0));
+ Chains.push_back(Setup.getValue(1));
+ }
+
// FIXME: This is the minimum kernel argument alignment. We should improve
// this to the maximum alignment of the arguments.
//
@@ -3026,7 +3066,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
+ ++i) {
const ISD::InputArg &Arg = Ins[i];
if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
InVals.push_back(DAG.getPOISON(Arg.VT));
@@ -3374,7 +3415,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
unsigned Opc = AMDGPUISD::ENDPGM;
if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
+ Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
+ : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
+ : AMDGPUISD::RET_GLUE;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
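
Note: the opcode choice above, factored into a standalone sketch for
readability (helper name is illustrative, not part of the patch):

  static unsigned pickReturnOpcode(bool IsWaveEnd, bool IsWholeWave,
                                   bool IsShader) {
    if (IsWaveEnd)
      return AMDGPUISD::ENDPGM;            // ends the wave outright
    if (IsWholeWave)
      return AMDGPUISD::WHOLE_WAVE_RETURN; // also restores the original EXEC
    return IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
  }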
@@ -3876,7 +3919,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
+ if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -4412,19 +4456,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
}
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
- if (Op->isDivergent())
+ if (Op->isDivergent() &&
+ (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
+ // Cannot do I$ prefetch with divergent pointer.
return SDValue();
switch (cast<MemSDNode>(Op)->getAddressSpace()) {
case AMDGPUAS::FLAT_ADDRESS:
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
break;
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ if (Subtarget->hasSafeSmemPrefetch())
+ break;
+ [[fallthrough]];
default:
return SDValue();
}
+ // I$ prefetch
+ if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
+ return SDValue();
+
return Op;
}
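
Note: operand 4 of ISD::PREFETCH is the cache-type immediate from
llvm.prefetch (0 = instruction, 1 = data), so !Op.getConstantOperandVal(4)
identifies an I$ prefetch. The legality rules above, condensed into a sketch:

  static bool keepPrefetchSketch(bool DivergentPtr, bool IsDataPrefetch,
                                 unsigned AS, const GCNSubtarget &ST) {
    // Only a VMEM (data) prefetch may use a divergent pointer.
    if (DivergentPtr && (!ST.hasVmemPrefInsts() || !IsDataPrefetch))
      return false;
    switch (AS) {
    case AMDGPUAS::FLAT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
    case AMDGPUAS::CONSTANT_ADDRESS:
      break;
    case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
      if (!ST.hasSafeSmemPrefetch())
        return false;
      break;
    default:
      return false;
    }
    // Without safe SMEM prefetch there is no usable I$ prefetch.
    return ST.hasSafeSmemPrefetch() || IsDataPrefetch;
  }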
@@ -5395,6 +5448,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
+ if (ST.hasAddSubU64Insts()) {
+ auto I = BuildMI(*BB, MI, DL,
+ TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
+ : AMDGPU::V_SUB_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp
+ TII->legalizeOperands(*I);
+ MI.eraseFromParent();
+ return BB;
+ }
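
Note: with this change the expansion of the 64-bit add/sub pseudos is a
three-way cascade; sketched below (enum and helper are illustrative):

  enum class U64Expansion { AddSubU64, LshlAddU64, Split32 };

  static U64Expansion pickU64Expansion(bool IsAdd, const GCNSubtarget &ST) {
    if (ST.hasAddSubU64Insts())
      return U64Expansion::AddSubU64;  // single v_add_u64 / v_sub_u64
    if (IsAdd && ST.hasLshlAddU64Inst())
      return U64Expansion::LshlAddU64; // v_lshl_add_u64 (add only)
    return U64Expansion::Split32;      // 32-bit halves with a carry chain
  }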
+
if (IsAdd && ST.hasLshlAddU64Inst()) {
auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
Dest.getReg())
@@ -5890,6 +5956,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
+ assert(MFI->isWholeWaveFunction());
+
+ // During ISel, it's difficult to propagate the original EXEC mask to use as
+ // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
+ MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
+ assert(Setup && "Couldn't find SI_WHOLE_WAVE_FUNC_SETUP");
+ Register OriginalExec = Setup->getOperand(0).getReg();
+ MF->getRegInfo().clearKillFlags(OriginalExec);
+ MI.getOperand(0).setReg(OriginalExec);
+ return BB;
+ }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
@@ -11172,7 +11250,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
// f16 is always accurate enough
- if (!AllowInaccurateRcp && VT != MVT::f16)
+ if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
return SDValue();
if (CLHS->isExactlyValue(1.0)) {
@@ -11199,9 +11277,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- // For f16 require afn or arcp.
+ // For f16 and bf16 require afn or arcp.
// For f32 require afn.
- if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
+ if (!AllowInaccurateRcp &&
+ ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
return SDValue();
// Turn into multiply by the reciprocal.
@@ -11592,7 +11671,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f64)
return LowerFDIV64(Op, DAG);
- if (VT == MVT::f16)
+ if (VT == MVT::f16 || VT == MVT::bf16)
return LowerFDIV16(Op, DAG);
llvm_unreachable("Unexpected type for fdiv");
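
Note: piecing together the two rcp checks above (the constant-numerator path
and the general path), the gating amounts to the following sketch:

  static bool mayUseRcpSketch(EVT VT, bool AllowInaccurateRcp,
                              bool HasAllowReciprocal, bool ConstNumerator) {
    // f16 and bf16 rcp are treated as accurate enough for !fpmath purposes.
    bool SmallFP = VT == MVT::f16 || VT == MVT::bf16;
    if (ConstNumerator)
      return AllowInaccurateRcp || SmallFP;
    return AllowInaccurateRcp || (SmallFP && HasAllowReciprocal);
  }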
@@ -13600,6 +13679,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_tanh:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_sqrt:
@@ -14013,7 +14093,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
case ISD::FMAXIMUMNUM:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY:
- return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
+ (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2af0a57..dd3f2fe 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1812,6 +1812,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::SI_RETURN ||
+ MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
@@ -2107,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(TII->isFLAT(MI));
- // All flat instructions use the VMEM counter.
- assert(TII->usesVM_CNT(MI));
+ // All flat instructions use the VMEM counter except prefetch.
+ if (!TII->usesVM_CNT(MI))
+ return false;
// If there are no memory operands then conservatively assume the flat
// operation may access VMEM.
@@ -2294,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
- // A Flat memory operation must access at least one address space.
- assert(FlatASCount);
-
// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 6b41934..89d9b0d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -318,6 +318,7 @@ def CPolBit {
int DLC = 2;
int SCC = 4;
int NV = 5;
+ int SCAL = 11;
}
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c8935f0..40e6871 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2472,6 +2472,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -5481,6 +5482,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
+ if (CPol->getImm() & AMDGPU::CPol::SCAL) {
+ if (!ST.hasScaleOffset()) {
+ ErrInfo = "Subtarget does not support offset scaling";
+ return false;
+ }
+ if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
+ ErrInfo = "Instruction does not support offset scaling";
+ return false;
+ }
+ }
+ }
+
return true;
}
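
Note: this patch does not spell out the semantics of the SCAL bit; reading
the name and the SILoadStoreOptimizer change below, the assumption is that
scale_offset multiplies the register offset by the access size before the
address is formed, e.g.:

  // Assumption, not stated in the patch: effective address with scale_offset.
  static uint64_t effectiveAddress(uint64_t Base, uint64_t RegOffset,
                                   unsigned AccessSize, bool ScaleOffset) {
    return Base + (ScaleOffset ? RegOffset * AccessSize : RegOffset);
  }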
@@ -5757,6 +5771,19 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}
+MachineInstr *
+SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
+ assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
+ "Not a whole wave func");
+ MachineBasicBlock &MBB = *MF.begin();
+ for (MachineInstr &MI : MBB)
+ if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
+ MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+ return &MI;
+
+ llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
+}
+
static const TargetRegisterClass *
adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
const MachineRegisterInfo &MRI,
@@ -7334,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MUL_U64:
+ if (ST.hasVectorMulU64()) {
+ NewOpcode = AMDGPU::V_MUL_U64_e64;
+ break;
+ }
// Split s_mul_u64 in 32-bit vector multiplications.
splitScalarSMulU64(Worklist, Inst, MDT);
Inst.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5e92921..800ea9a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1215,6 +1215,8 @@ public:
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
Register Reg, SlotIndexes *Indexes = nullptr) const;
+ MachineInstr *getWholeWaveFunctionSetup(MachineFunction &MF) const;
+
/// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9e1951e..b0be3f86 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1307,6 +1307,9 @@ let PrintMethod = "printBitOp3" in
def BitOp3 : NamedIntOperand<"bitop3">;
def bitop3_0 : DefaultOperand<BitOp3, 0>;
+def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">;
+def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
+
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
@@ -1659,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
+def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -1882,6 +1887,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
!eq(VT, v4bf16) : AVSrc_64,
!eq(VT.Size, 1024) : VRegSrc_1024,
!eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 384) : VRegSrc_384,
!eq(VT.Size, 256) : VRegSrc_256,
!eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
@@ -1894,6 +1900,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
class getVOP3VRegSrcForVT<ValueType VT> {
RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
!eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 384) : VRegSrc_384,
!eq(VT.Size, 256) : VRegSrc_256,
!eq(VT.Size, 192) : VRegSrc_192,
!eq(VT.Size, 128) : VRegSrc_128,
@@ -2666,6 +2673,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
HasOMod);
field bit HasNeg = HasModifiers;
field bit HasMatrixReuse = 0;
+ field bit HasMatrixFMT = 0;
field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
@@ -2860,6 +2868,7 @@ def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>;
def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
@@ -2867,10 +2876,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>;
def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>;
def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
@@ -2906,8 +2917,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
+def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 991d9f8..d05be8f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -644,6 +644,32 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
let isConvergent = 1;
}
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_WHOLE_WAVE_FUNC_SETUP : SPseudoInstSI <
+ (outs SReg_1:$dst), (ins), [(set i1:$dst, (AMDGPUwhole_wave_setup))]> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+
+ let isConvergent = 1;
+}
+
+// Restores the previous EXEC and otherwise behaves entirely like SI_RETURN.
+def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
+ (outs), (ins SReg_1:$orig_exec)> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let SchedRW = [WriteBranch];
+
+ // We're going to use custom handling to set $orig_exec to the correct value.
+ let usesCustomInserter = 1;
+}
+
+// Generate a SI_WHOLE_WAVE_FUNC_RETURN pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+ (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -2473,6 +2499,7 @@ def : AMDGPUPat <
>;
let True16Predicate = NotHasTrue16BitInsts in {
+let SubtargetPredicate = isNotGFX9Plus in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2482,6 +2509,35 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+} // isNotGFX9Plus
+
+let SubtargetPredicate = isGFX9GFX10 in {
+def : GCNPat <
+ (rotr i32:$src0, i32:$src1),
+ (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src0,
+ /* src2_modifiers */ 0,
+ $src1, /* clamp */ 0, /* op_sel */ 0)
+>;
+
+foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+ (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
+def : GCNPat<pat,
+ (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ 0, /* src1_modifiers */
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
+ 0, /* src2_modifiers */
+ $src1, /* clamp */ 0, /* op_sel */ 0)
+>;
+
+def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+ (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
+ /* src1_modifiers */ 0, $src1,
+ /* src2_modifiers */ 0,
+ $src2, /* clamp */ 0, /* op_sel */ 0)
+>;
+} // isGFX9GFX10
} // end True16Predicate = NotHasTrue16BitInsts
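
Note: the rotr/fshr patterns above all rely on v_alignbit_b32 being a 32-bit
funnel shift right; a scalar model consistent with them:

  // src0 supplies the high 32 bits, src1 the low 32 bits; only the low five
  // bits of src2 are used, so alignbit(x, x, s) == rotr(x, s).
  static uint32_t alignbitModel(uint32_t Src0, uint32_t Src1, uint32_t Src2) {
    uint64_t Wide = (uint64_t(Src0) << 32) | Src1;
    return uint32_t(Wide >> (Src2 & 31));
  }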
let True16Predicate = UseRealTrue16Insts in {
@@ -3082,6 +3138,8 @@ def : GCNPat <
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
+// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped
+// to V_PERM_B32.
let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(i32 (bswap i32:$a)),
@@ -3559,15 +3617,20 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
-let True16Predicate = NotHasTrue16BitInsts in
-def : GCNPat <
+let True16Predicate = NotHasTrue16BitInsts in {
+defvar BuildVectorToAlignBitPat =
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
- (Ty VGPR_32:$b))),
- (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
->;
+ (Ty VGPR_32:$b)));
+
+let SubtargetPredicate = isNotGFX9Plus in
+def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>;
+
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>;
+} // True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
@@ -4300,6 +4363,20 @@ def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_WHOLE_WAVE_FUNC_SETUP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$origExec);
+ let InOperandList = (ins);
+ let isConvergent = 1;
+}
+
+def G_AMDGPU_WHOLE_WAVE_FUNC_RETURN : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$origExec);
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 5097ac03..b49c5a9 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -61,6 +61,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if (EltOffset0 + CI.Width != EltOffset1 &&
EltOffset1 + Paired.Width != EltOffset0)
return false;
- if (CI.CPol != Paired.CPol)
+ // Instructions with the scale_offset modifier cannot be combined unless we
+ // also generate code to scale the offset and reset that bit.
+ if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
return false;
if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8c2e9b62..f0be204 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -51,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+ GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
+ IsWholeWaveFunction(F.getCallingConv() ==
+ CallingConv::AMDGPU_Gfx_WholeWave) {
const GCNSubtarget &ST = *STI;
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
@@ -99,7 +101,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
ImplicitArgPtr = false;
} else if (!isEntryFunction()) {
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx &&
+ CC != CallingConv::AMDGPU_Gfx_WholeWave)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
FrameOffsetReg = AMDGPU::SGPR33;
@@ -732,6 +735,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+ IsWholeWaveFunction(MFI.isWholeWaveFunction()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
@@ -778,6 +782,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
+ IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60ad..08b0206 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue LongBranchReservedReg;
bool HasInitWholeWave = false;
+ bool IsWholeWaveFunction = false;
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
@@ -356,6 +357,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
+ YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
}
};
@@ -565,6 +567,8 @@ private:
// the serialization easier.
ReservedRegSet WWMReservedRegs;
+ bool IsWholeWaveFunction = false;
+
using PrologEpilogSGPRSpill =
std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
// To track the SGPR spill method used for a CSR SGPR register during
@@ -670,6 +674,8 @@ public:
return WWMReservedRegs.contains(Reg);
}
+ bool isWholeWaveFunction() const { return IsWholeWaveFunction; }
+
ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first()));
return PrologEpilogSGPRSpills;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060..0e8a420 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
}
-/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
-/// If this tag isn't present, or if it has no meaningful values, returns \p
-/// Default. Otherwise returns all the address spaces concerned by the MMRA.
-static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
- SIAtomicAddrSpace Default) {
- static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
+/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
+/// If this tag isn't present, or if it has no meaningful values, returns
+/// std::nullopt; otherwise returns the address spaces specified by the MD.
+static std::optional<SIAtomicAddrSpace>
+getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
+ static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
if (!MMRA)
- return Default;
+ return std::nullopt;
SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
for (const auto &[Prefix, Suffix] : MMRA) {
@@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
diagnoseUnknownMMRAASName(MI, Suffix);
}
- return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
+ if (Result == SIAtomicAddrSpace::NONE)
+ return std::nullopt;
+
+ return Result;
}
} // end anonymous namespace
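
Note: the MMRA carries (prefix, suffix) string pairs on the fence; a sketch
of the suffix handling consistent with the parser above. The recognized
suffix spellings ("local", "global") are an assumption here, not taken from
this patch:

  static std::optional<SIAtomicAddrSpace>
  collectSynchronizeAS(const MMRAMetadata &MMRA) {
    SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
    for (const auto &[Prefix, Suffix] : MMRA) {
      if (Prefix != "amdgpu-synchronize-as")
        continue;
      if (Suffix == "local")        // assumed spelling
        Result |= SIAtomicAddrSpace::LDS;
      else if (Suffix == "global")  // assumed spelling
        Result |= SIAtomicAddrSpace::GLOBAL;
    }
    if (Result == SIAtomicAddrSpace::NONE)
      return std::nullopt;
    return Result;
  }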
@@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
*ScopeOrNone;
- if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
+ // We currently expect refineOrderingAS to be the only place that
+ // can refine the AS ordered by the fence.
+ // If that changes, we need to review the semantics of that function
+ // in case it needs to preserve certain address spaces.
reportUnsupported(MI, "Unsupported atomic address space");
return std::nullopt;
}
+ auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
+ if (SynchronizeAS)
+ OrderingAddrSpace = *SynchronizeAS;
+
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
@@ -2687,11 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
AtomicPseudoMIs.push_back(MI);
bool Changed = false;
- // Refine fenced address space based on MMRAs.
- //
- // TODO: Should we support this MMRA on other atomic operations?
- auto OrderingAddrSpace =
- getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
+ const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 7093fe6..5940f45 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -85,7 +85,8 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_WGP_MODE(ProgInfo.WgpMode) |
- S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered) |
+ S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
if (ST.hasDX10ClampMode())
Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
@@ -93,10 +94,6 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
if (ST.hasIEEEMode())
Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
- // TODO: in the long run we will want to enable this unconditionally.
- if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA)
- Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
-
if (ST.hasRrWGMode())
Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index fa2b8db..84cfa87 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -407,6 +407,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
: CSR_AMDGPU_SaveList;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
: CSR_AMDGPU_SI_Gfx_SaveList;
case CallingConv::AMDGPU_CS_ChainPreserve:
@@ -433,6 +434,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
: CSR_AMDGPU_RegMask;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
: CSR_AMDGPU_SI_Gfx_RegMask;
case CallingConv::AMDGPU_CS_Chain:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c194e5c..0039d2f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1207,6 +1207,7 @@ def VRegSrc_96 : SrcReg9<VReg_96>;
def VRegSrc_128: SrcReg9<VReg_128>;
def VRegSrc_192: SrcReg9<VReg_192>;
def VRegSrc_256: SrcReg9<VReg_256>;
+def VRegSrc_384: SrcReg9<VReg_384>;
def VRegSrc_512: SrcReg9<VReg_512>;
def VRegSrc_1024: SrcReg9<VReg_1024>;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index ef8faff..8eecb1c 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -464,6 +464,20 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX12SpeedModel
+// Check if any matrix inputs are interpreted as f8 in an f8f6f4
+// wmma instruction.
+def PredIsF8_WMMA_SCALE : SchedPredicate<[{
+ TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_a_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8 ||
+ TII->getNamedOperand(*MI, AMDGPU::OpName::matrix_b_fmt)->getImm() <= AMDGPU::WMMA::MATRIX_FMT_BF8
+}]>;
+
+// If either matrix format is f8, the instruction takes 2x as many
+// cycles. TODO: This isn't reflected in MCA.
+def WriteWMMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
+ SchedVar<PredIsF8_WMMA_SCALE, [WriteXDL4PassWMMA]>,
+ SchedVar<NoSchedPred, [WriteXDL2PassWMMA]>
+]>;
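
Note: reading the variant, either matrix format being 8-bit selects the
4-pass XDL write, otherwise the 2-pass one; equivalently, as a sketch:

  static unsigned wmmaScalePassesSketch(unsigned FmtA, unsigned FmtB) {
    // Per the SchedPredicate above, values <= MATRIX_FMT_BF8 are f8 formats.
    auto IsF8 = [](unsigned Fmt) {
      return Fmt <= AMDGPU::WMMA::MATRIX_FMT_BF8;
    };
    return (IsF8(FmtA) || IsF8(FmtB)) ? 4 : 2;
  }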
+
multiclass GFX125xCommonWriteRes {
let ReleaseAtCycles = [8] in
@@ -495,6 +509,7 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
+def : InstRW<[WriteWMMAScale_16X16X128_F8F6F4], (instregex "^V_WMMA_.*_16X16X128_F8F6F4.*_w32")>;
def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
} // End GFX125xCommonWriteRes
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index d8b52d2..4bda51d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -856,16 +856,18 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
- [{ return !N->getOperand(1)->isDivergent();}]> {
+ [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> {
let GISelPredicateCode = [{
- return isInstrUniform(MI);
+ return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch();
}];
}
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
-def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
+let WantsRoot = true in {
+ def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>;
+ def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>;
+}
def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
@@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
- (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+ (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
// 4. SGPR+IMM offset
def : GCNPat <
- (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+ (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
@@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
// 2. SGPR offset
def : GCNPat <
- (node (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+ (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
- (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+ (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
@@ -1150,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
}
defm : SMPrefetchPat<"INST", i32imm_zero>;
+let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case.
defm : SMPrefetchPat<"DATA", i32imm_one>;
let SubtargetPredicate = isGFX12Plus in {
@@ -1488,6 +1491,7 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs
let Inst{20} = cpol{CPolBit.NV}; // non-volatile
let Inst{22-21} = cpol{4-3}; // scope
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+ let Inst{56} = cpol{CPolBit.SCAL}; // scale offset
}
multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7725881..b5b3cc9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -598,6 +598,29 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
}
+uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt) {
+ switch (Fmt) {
+ case WMMA::MATRIX_FMT_FP8:
+ case WMMA::MATRIX_FMT_BF8:
+ return 16;
+ case WMMA::MATRIX_FMT_FP6:
+ case WMMA::MATRIX_FMT_BF6:
+ return 12;
+ case WMMA::MATRIX_FMT_FP4:
+ return 8;
+ }
+
+ llvm_unreachable("covered switch over wmma scale formats");
+}
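
Note: the register counts are exactly twice the element bit width (8-bit
formats -> 16 VGPRs, 6-bit -> 12, 4-bit -> 8), which makes the switch easy
to sanity-check:

  // Sketch: equivalent to the switch above for the three legal widths.
  static uint8_t numRegsFromElemBits(unsigned ElemBits) {
    assert(ElemBits == 8 || ElemBits == 6 || ElemBits == 4);
    return static_cast<uint8_t>(2 * ElemBits);
  }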
+
+const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
+ unsigned FmtB,
+ unsigned F8F8Opcode) {
+ uint8_t SrcANumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtA);
+ uint8_t SrcBNumRegs = wmmaScaleF8F6F4FormatToNumRegs(FmtB);
+ return getMFMA_F8F6F4_InstWithNumRegs(SrcANumRegs, SrcBNumRegs, F8F8Opcode);
+}
+
unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) {
if (ST.hasFeature(AMDGPU::FeatureGFX1250Insts))
return SIEncodingFamily::GFX1250;
@@ -3205,6 +3228,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
+ uint64_t TSFlags = MII.get(Opcode).TSFlags;
+
+ if (TSFlags & SIInstrFlags::SMRD)
+ return !getSMEMIsBuffer(Opcode);
+ if (!(TSFlags & SIInstrFlags::FLAT))
+ return false;
+
+ // Only SV and SVS modes are supported.
+ if (TSFlags & SIInstrFlags::FlatScratch)
+ return hasNamedOperand(Opcode, OpName::vaddr);
+
+ // Only GVS mode is supported.
+ return hasNamedOperand(Opcode, OpName::vaddr) &&
+ hasNamedOperand(Opcode, OpName::saddr);
+}
+
bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) {
int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c9d2c28..c09a9d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -627,6 +627,14 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
unsigned BLGP,
unsigned F8F8Opcode);
+LLVM_READNONE
+uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt);
+
+LLVM_READONLY
+const MFMA_F8F6F4_Info *getWMMA_F8F6F4_WithFormatArgs(unsigned FmtA,
+ unsigned FmtB,
+ unsigned F8F8Opcode);
+
LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
@@ -1423,7 +1431,8 @@ constexpr bool isShader(CallingConv::ID CC) {
LLVM_READNONE
constexpr bool isGraphics(CallingConv::ID CC) {
- return isShader(CC) || CC == CallingConv::AMDGPU_Gfx;
+ return isShader(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave;
}
LLVM_READNONE
@@ -1748,6 +1757,9 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
/// \returns true if the intrinsic is uniform
bool isIntrinsicAlwaysUniform(unsigned IntrID);
+/// \returns true if a memory instruction supports scale_offset modifier.
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
+
/// \returns lds block size in terms of dwords. \p
/// This is used to calculate the lds size encoded for PAL metadata 3.0+ which
/// must be defined in terms of bytes.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index e464470..fd6253d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -44,6 +44,7 @@ static const char *getStageName(CallingConv::ID CC) {
case CallingConv::AMDGPU_LS:
return ".ls";
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
llvm_unreachable("Callable shader has no hardware stage");
default:
return ".cs";
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 030a6e1..550ec9d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -925,6 +925,17 @@ let isAdd = 1 in {
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
}
+let isReMaterializable = 1 in {
+let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
+defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
+// We don't actually have something like V_SUBREV_U64, so V_SUB_U64 can't be treated as commutable.
+let isCommutable = 0 in
+defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
+} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
+defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
+} // End isReMaterializable = 1
+
} // End isCommutable = 1
// These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
+ VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
+
multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
string asmName> {
defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
+defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
+defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
+defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
//===----------------------------------------------------------------------===//
// GFX11.
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 2e7f25b..b6f9568 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -224,6 +224,12 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
fshr, null_frag>;
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+// In GFX9 and GFX10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32.
+// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored.
+defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+
let True16Predicate = UseRealTrue16Insts in
defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
let True16Predicate = UseFakeTrue16Insts in
@@ -265,6 +271,16 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
} // End isReMaterializable = 1
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat <
+(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)),
+ (i32 (VOP3OpSelMods i32:$src1, i32:$src1_modifiers)),
+ (i32 (VOP3OpSelMods i32:$src2, i32:$src2_modifiers)))),
+(V_ALIGNBYTE_B32_opsel_e64 i32:$src0_modifiers, VSrc_b32:$src0,
+ i32:$src1_modifiers, VSrc_b32:$src1,
+ i32:$src2_modifiers, VGPR_32:$src2)
+>;
+
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)),
@@ -1902,6 +1918,7 @@ let AssemblerPredicate = isGFX11Plus in {
// These instructions differ from GFX12 variant by supporting DPP:
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
+defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1954,6 +1971,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
+defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
@@ -2104,8 +2124,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>;
defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
@@ -2248,6 +2268,17 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
+// Instructions such as v_alignbyte_b32 allow op_sel in GFX9, but not in VI.
+// The following multiclass is created to support that.
+multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> {
+ defvar psName = opName#"_e64";
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI
+ VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
@@ -2267,8 +2298,10 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
+let SubtargetPredicate = isGFX8Only in {
defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
+}
defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
@@ -2313,6 +2346,9 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16"
defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
+defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index e51e957..c812dc9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
+ defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret;
+ defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret;
+ defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret;
+
dag srcs =
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ (ins FP16InputMods:$src0_modifiers, Src0RC:$src0,
+ FP16InputMods:$src1_modifiers, Src1RC:$src1,
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
dag dpp_srcs =
(ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
// FIXME: Clamp0 misbehaves with the non-default vdst_in
// following it. For now workaround this by requiring clamp
@@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
-let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
+let isCommutable = 1, FPDPRounding = 1 in {
+let SubtargetPredicate = HasMin3Max3PKF16 in {
+defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>;
+defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>;
+}
+
+let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
}
+} // End isCommutable = 1, FPDPRounding = 1
// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
- Instruction mixhi_inst> {
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
(AMDGPUclamp (build_vector
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
DSTCLAMP.ENABLE,
@@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
@@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
DSTCLAMP.NONE,
@@ -217,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = p in {
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
@@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
@@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
- (v2f16 (mixlo_inst $src0_modifiers, $src0,
+ (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+ (vecVT (mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
} // end True16Predicate
}
@@ -353,6 +368,67 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
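+// MadFmaMixPats is parameterized over the scalar type VT and the packed
+// vector type vecVT (f16/v2f16 by default), so the same patterns can be
+// instantiated for the bf16 mix instructions below.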
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
+def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
+ let HasModifiers = 0;
+}
+
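+// These packed 16-bit add+min/max and 3-operand min/max instructions share
+// a profile with no source modifiers (HasModifiers = 0).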
+let isCommutable = 1, isReMaterializable = 1 in {
+let SubtargetPredicate = HasPkAddMinMaxInsts in {
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+}
+let SubtargetPredicate = HasPkMinMax3Insts in {
+defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_MAX3_U16 : VOP3PInst<"v_pk_max3_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_MIN3_I16 : VOP3PInst<"v_pk_min3_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>;
+}
+} // End isCommutable = 1, isReMaterializable = 1
+
+// TODO: Extend pattern to select op_sel and op_sel_hi.
+class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
+ VOP3P_Pseudo inst,
+ ValueType vt = inst.Pfl.Src0VT,
+ RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat <
+ (ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2),
+ (inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1,
+ SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0)
+>;
+
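+// For example, ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16> matches
+// the ThreeOpFrag fragment (smax (add x, y), z) and selects
+// V_PK_ADD_MAX_I16 with OP_SEL_1 on all three sources and clamp disabled.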
+let SubtargetPredicate = HasPkAddMinMaxInsts in {
+def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+}
+
+let SubtargetPredicate = HasPkMinMax3Insts in {
+def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>;
+def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>;
+def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>;
+def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>;
+}
+
// Defines patterns that extract signed 4bit from each Idx[0].
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
@@ -1153,6 +1229,14 @@ let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+
+ let SubtargetPredicate = HasBF16PackedInsts in {
+ defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>;
+ defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
+ defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
+ defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+ }
} // End isCommutable = 1, isReMaterializable = 1
def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
@@ -1318,13 +1402,15 @@ let WaveSizePredicate = isWave64 in {
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
- bit _HasMatrixReuse = 0, bit _IsF4 = 0>
+ bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0,
+ bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
int IndexType = _IndexType;
+ let HasMatrixFMT = _HasMatrixFMT;
let HasMatrixReuse = _HasMatrixReuse;
bit HasIModOp = _Has_ImodOp;
@@ -1422,7 +1508,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
!eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
!eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
-
+ dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
+ (ins));
dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
@@ -1436,7 +1523,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
(ins VRegSrc_64:$src2),
(ins VRegSrc_32:$src2)),
IndexKey)),
- MatrixReuse, Clamp, Neg);
+ MatrixFMT, MatrixReuse, Clamp, Neg);
// asm
@@ -1444,13 +1531,14 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : "$index_key_8bit",
!eq(IndexType, 16) : "$index_key_16bit",
!eq(IndexType, 32) : "$index_key_32bit");
+  string MatrixFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", "");
string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
string ClampAsm = !if(HasClamp, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
- let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm;
+  let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm;
// isel patterns
bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
@@ -1462,6 +1550,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
IsAB_BF16_IMod0 : (ins Src0VT:$src0),
IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0),
NoABMods : (ins Src0VT:$src0));
dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
@@ -1474,6 +1563,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
IsAB_BF16_IMod0 : (ins Src1VT:$src1),
IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1),
NoABMods : (ins Src1VT:$src1));
dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
@@ -1499,7 +1589,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
IsIUXF32 : (ins Src2VT:$src2),
IsSWMMAC : (ins));
dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
-
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
!eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))),
@@ -1508,6 +1597,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
!eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
!eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
+ dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins));
dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
@@ -1515,7 +1605,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
- dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
@@ -1523,7 +1613,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// WMMA patterns where src2 is an inline immediate use the _threeaddr pseudo;
// _twoaddr can't be used since it would violate the constraint that src2 is
// tied to vdst.
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
- dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat);
}
def WMMAInstInfoTable : GenericTable {
@@ -1632,26 +1722,45 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
// *** IU4X32_SWMMAC_w64: lanes 0-31 will have 8xi4, remaining lanes are
// ignored for matrix A; the index is i16. Matrix B uses all lanes.
-def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>;
-def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>;
-def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
-def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>;
-def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>;
-def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>;
-def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
-def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>;
-def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>;
-def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>;
-def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>;
-def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>;
-def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>;
-def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>;
-def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>;
-def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>;
-def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>;
-def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>;
-def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>;
-def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>;
+def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>;
+def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>;
+def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>;
+def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
+def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>;
+def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
+def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>;
+def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>;
+def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>;
+def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
+def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>;
+def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>;
+def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>;
+def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>;
+def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>;
+def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>;
+
+multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> {
+ def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+ def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>;
+}
+
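+// The source formats determine the operand register counts: fp8 matrices
+// use v16i32, fp6 v12i32, and fp4 v8i32; the accumulator is always v8f32.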
+defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>;
+
+multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> {
+ foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ defm _#I#_w32 : WMMAInstGFX12<OpName # "_" # I # "_w32", !cast<VOP3PWMMA_Profile>(Profile # "_" # I # "_w32"), "_w32">;
+ }
+}
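+
+// Instantiates one pseudo per A/B source-format pair, e.g.
+// V_WMMA_F32_16X16X128_F8F6F4_f8_f6_w32 from the matching profile.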
let WaveSizePredicate = isWave32 in {
let SubtargetPredicate = isGFX125xOnly in {
@@ -1697,6 +1806,8 @@ defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x12
defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">;
defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
+
} // End is_wmma_xdl = 1.
} // End SubtargetPredicate = isGFX125xOnly
@@ -1854,6 +1965,10 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+ foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
+ }
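+  // All nine variants select from the single f8f6f4 intrinsic; the
+  // matrix_a_fmt/matrix_b_fmt immediate operands distinguish the formats.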
+
def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>;
def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
@@ -1912,17 +2027,22 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<8> op, string backing_ps_name = NAME
class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
: VOP3Pe_gfx11_gfx12<op, P>{
+
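+  // When HasMatrixFMT is set, the opsel bits {11,12,13} are repurposed to
+  // encode matrix_a_fmt and the opsel_hi bits {59,60,14} to encode
+  // matrix_b_fmt; other profiles keep their previous meanings for these bits.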
// opsel
- let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
+ let Inst{11} = !cond(WMMAP.HasMatrixFMT : matrix_a_fmt{0},
+ !eq(WMMAP.IndexType, 0) : 0,
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
!eq(WMMAP.IndexType, 16) : index_key_16bit{0},
!eq(WMMAP.IndexType, 32) : index_key_32bit{0});
- let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
- let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0);
+ let Inst{12} = !if(WMMAP.HasMatrixFMT, matrix_a_fmt{1},
+ !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0));
+  let Inst{13} = !if(WMMAP.HasMatrixFMT, matrix_a_fmt{2},
+ !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0));
// opsel_hi
- let Inst{59} = 1;
- let Inst{60} = 1;
- let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1);
+  let Inst{59} = !if(WMMAP.HasMatrixFMT, matrix_b_fmt{0}, 1);
+  let Inst{60} = !if(WMMAP.HasMatrixFMT, matrix_b_fmt{1}, 1);
+  let Inst{14} = !if(WMMAP.HasMatrixFMT, matrix_b_fmt{2},
+ !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1));
// neg_lo
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
@@ -1961,6 +2081,24 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
+multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
+ let AsmString = asmName # PS.AsmOperands in
+ defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>,
+ MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">;
+}
+
+multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> {
+ defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>;
+ foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
+ let isAsmParserOnly = true in { // Disable ambiguous disassembly.
+ defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>;
+ }
+ }
+}
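+// All nine format variants of an op share one opcode; only the _f8_f8
+// variant stays visible to the disassembler, the rest are assembler-only
+// to avoid ambiguous decoding.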
+
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@@ -2035,6 +2173,8 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B
defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">;
+
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
@@ -2101,6 +2241,8 @@ multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op>
multiclass VOP3P_Real_gfx12<bits<8> op> : VOP3P_Real_Base<GFX12Gen, op>;
+multiclass VOP3P_Real_gfx1250<bits<8> op> : VOP3P_Real_Base<GFX1250Gen, op>;
+
multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
@@ -2109,6 +2251,35 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
+defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>;
+defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>;
+defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>;
+
+defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
+defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
+defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
+defm V_PK_ADD_MIN_U16 : VOP3P_Real_gfx1250<0x2e>;
+defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
+defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
+defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
+defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;
+defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>;
+defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>;
+defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>;
+defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>;
+defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>;
+defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>;
+defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
+defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
+defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
+
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
+let AssemblerPredicate = isGFX1250Plus in
+def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
+
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a25ebdf..c21e2d3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -453,6 +453,8 @@ class VOP3Pe_Base {
bits<2> index_key_8bit;
bits<1> index_key_16bit;
bits<1> index_key_32bit;
+ bits<3> matrix_a_fmt;
+ bits<3> matrix_b_fmt;
bits<1> matrix_a_reuse;
bits<1> matrix_b_reuse;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fd3b052..fca5dff 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -20347,6 +20347,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
return weight;
}
+static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
+ if (PR == 0 || VT == MVT::Other)
+ return false;
+ return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
+ (ARM::DPRRegClass.contains(PR) && VT != MVT::f64);
+}
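+// For example, a "{s0}" constraint used with a 64-bit value type is now
+// rejected instead of silently yielding a mismatched SPR.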
+
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
@@ -20420,7 +20427,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
if (StringRef("{cc}").equals_insensitive(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ if (isIncompatibleReg(RCP.first, VT))
+ return {0, nullptr};
+ return RCP;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -21731,11 +21741,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
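+  // The interface now passes any store-like instruction plus an optional
+  // lane mask; ARM handles only plain unmasked StoreInsts here.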
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9159f3d..825145d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -685,7 +685,8 @@ class VectorType;
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index eaba6fe..a7a9911 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -593,7 +593,7 @@ public:
getContext().reportError(Loc, "relocated expression must be 32-bit");
return;
}
- getOrCreateDataFragment();
+ getCurrentFragment();
}
emitDataMappingSymbol();
@@ -1207,7 +1207,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
}
void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
- MCFragment *Frag = getOrCreateDataFragment();
+ MCFragment *Frag = getCurrentFragment();
Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind));
}
@@ -1295,7 +1295,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
visitUsedExpr(*PersonalityRef);
- MCFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getCurrentFragment();
DF->addFixup(
MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index db09738..128cc0b 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -514,19 +514,7 @@ bool AVRAsmBackend::forceRelocation(const MCFragment &F, const MCFixup &Fixup,
return false;
case AVR::fixup_7_pcrel:
- case AVR::fixup_13_pcrel: {
- uint64_t Offset = Target.getConstant();
- uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
-
- // If the jump is too large to encode it, fall back to a relocation.
- //
- // Note that trying to actually link that relocation *would* fail, but the
- // hopes are that the module we're currently compiling won't be actually
- // linked to the final binary.
- return !adjust::adjustRelativeBranch(Size, Fixup, Offset,
- getContext().getSubtargetInfo());
- }
-
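+  // PC-relative branch fixups now always force a relocation; previously a
+  // relocation was emitted only when the branch target was out of range.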
+ case AVR::fixup_13_pcrel:
case AVR::fixup_call:
return true;
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 5963976..6ec78d0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -7,12 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AVRMCExpr.h"
-#include "MCTargetDesc/AVRMCAsmInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCValue.h"
namespace llvm {
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
index 5d49949..7faae8b 100644
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -22,7 +22,7 @@ class BPFTargetMachine;
class InstructionSelector;
class PassRegistry;
-static const char *BPF_TRAP = "__bpf_trap";
+#define BPF_TRAP "__bpf_trap"
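+// A macro avoids emitting a separate internal-linkage string definition in
+// every translation unit that includes this header.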
ModulePass *createBPFCheckAndAdjustIR();
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
index a0011e8..fa9007e 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -16,7 +16,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index d9d9b36..feecfc0 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -301,41 +301,53 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
}
bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- Value *PtrOperand = GEPI.getPointerOperand();
- Type *OrigGEPType = GEPI.getSourceElementType();
- Type *NewGEPType = OrigGEPType;
+ GEPOperator *GOp = cast<GEPOperator>(&GEPI);
+ Value *PtrOperand = GOp->getPointerOperand();
+ Type *NewGEPType = GOp->getSourceElementType();
bool NeedsTransform = false;
+ // Unwrap GEP ConstantExprs to find the base operand and element type
+ while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) {
+ if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) {
+ GOp = GEPCE;
+ PtrOperand = GEPCE->getPointerOperand();
+ NewGEPType = GEPCE->getSourceElementType();
+ } else
+ break;
+ }
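+  // This walks nested constant GEP expressions down to the underlying
+  // global or alloca, e.g. (gep (gep @G, ...), ...) resolves to @G.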
+
if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
NewGEPType = NewGlobal->getValueType();
PtrOperand = NewGlobal;
NeedsTransform = true;
} else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
Type *AllocatedType = Alloca->getAllocatedType();
- // Only transform if the allocated type is an array
- if (AllocatedType != OrigGEPType && isa<ArrayType>(AllocatedType)) {
+ if (isa<ArrayType>(AllocatedType) &&
+ AllocatedType != GOp->getResultElementType()) {
NewGEPType = AllocatedType;
NeedsTransform = true;
}
}
- // Scalar geps should remain scalars geps. The dxil-flatten-arrays pass will
- // convert these scalar geps into flattened array geps
- if (!isa<ArrayType>(OrigGEPType))
- NewGEPType = OrigGEPType;
-
- // Note: We bail if this isn't a gep touched via alloca or global
- // transformations
if (!NeedsTransform)
return false;
- IRBuilder<> Builder(&GEPI);
- SmallVector<Value *, MaxVecSize> Indices(GEPI.indices());
+ // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later
+ if (!isa<ArrayType>(GOp->getSourceElementType()))
+ NewGEPType = GOp->getSourceElementType();
+ IRBuilder<> Builder(&GEPI);
+ SmallVector<Value *, MaxVecSize> Indices(GOp->indices());
Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
- GEPI.getName(), GEPI.getNoWrapFlags());
- GEPI.replaceAllUsesWith(NewGEP);
- GEPI.eraseFromParent();
+ GOp->getName(), GOp->getNoWrapFlags());
+
+ GOp->replaceAllUsesWith(NewGEP);
+
+ if (auto *CE = dyn_cast<ConstantExpr>(GOp))
+ CE->destroyConstant();
+ else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
+ OldGEPI->eraseFromParent();
+
return true;
}
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index f0e2e78..7e1436e 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -263,8 +263,13 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// merge the byte offsets. Otherwise, this GEP is itself the root of a GEP
// chain and we need to determine the root array type
if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) {
- assert(GEPChainInfoMap.contains(PtrOpGEP) &&
- "Expected parent GEP to be visited before this GEP");
+
+ // If the parent GEP was not processed, then we do not want to process its
+ // descendants. This can happen if the GEP chain is for an unsupported type
+ // such as a struct -- we do not flatten structs nor GEP chains for structs
+ if (!GEPChainInfoMap.contains(PtrOpGEP))
+ return false;
+
GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP];
Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType;
Info.RootPointerOperand = PGEPInfo.RootPointerOperand;
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index c73648f..3427968 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -24,18 +24,19 @@
using namespace llvm;
-static void legalizeFreeze(Instruction &I,
+static bool legalizeFreeze(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *>) {
auto *FI = dyn_cast<FreezeInst>(&I);
if (!FI)
- return;
+ return false;
FI->replaceAllUsesWith(FI->getOperand(0));
ToRemove.push_back(FI);
+ return true;
}
-static void fixI8UseChain(Instruction &I,
+static bool fixI8UseChain(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
@@ -74,19 +75,19 @@ static void fixI8UseChain(Instruction &I,
if (Trunc->getDestTy()->isIntegerTy(8)) {
ReplacedValues[Trunc] = Trunc->getOperand(0);
ToRemove.push_back(Trunc);
- return;
+ return true;
}
}
if (auto *Store = dyn_cast<StoreInst>(&I)) {
if (!Store->getValueOperand()->getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewStore = Builder.CreateStore(NewOperands[0], NewOperands[1]);
ReplacedValues[Store] = NewStore;
ToRemove.push_back(Store);
- return;
+ return true;
}
if (auto *Load = dyn_cast<LoadInst>(&I);
@@ -104,17 +105,17 @@ static void fixI8UseChain(Instruction &I,
LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]);
ReplacedValues[Load] = NewLoad;
ToRemove.push_back(Load);
- return;
+ return true;
}
if (auto *Load = dyn_cast<LoadInst>(&I);
Load && isa<ConstantExpr>(Load->getPointerOperand())) {
auto *CE = dyn_cast<ConstantExpr>(Load->getPointerOperand());
if (!(CE->getOpcode() == Instruction::GetElementPtr))
- return;
+ return false;
auto *GEP = dyn_cast<GEPOperator>(CE);
if (!GEP->getSourceElementType()->isIntegerTy(8))
- return;
+ return false;
Type *ElementType = Load->getType();
ConstantInt *Offset = dyn_cast<ConstantInt>(GEP->getOperand(1));
@@ -143,12 +144,12 @@ static void fixI8UseChain(Instruction &I,
ReplacedValues[Load] = NewLoad;
Load->replaceAllUsesWith(NewLoad);
ToRemove.push_back(Load);
- return;
+ return true;
}
if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
if (!I.getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst =
@@ -162,24 +163,24 @@ static void fixI8UseChain(Instruction &I,
}
ReplacedValues[BO] = NewInst;
ToRemove.push_back(BO);
- return;
+ return true;
}
if (auto *Sel = dyn_cast<SelectInst>(&I)) {
if (!I.getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst = Builder.CreateSelect(Sel->getCondition(), NewOperands[1],
NewOperands[2]);
ReplacedValues[Sel] = NewInst;
ToRemove.push_back(Sel);
- return;
+ return true;
}
if (auto *Cmp = dyn_cast<CmpInst>(&I)) {
if (!Cmp->getOperand(0)->getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst =
@@ -187,18 +188,18 @@ static void fixI8UseChain(Instruction &I,
Cmp->replaceAllUsesWith(NewInst);
ReplacedValues[Cmp] = NewInst;
ToRemove.push_back(Cmp);
- return;
+ return true;
}
if (auto *Cast = dyn_cast<CastInst>(&I)) {
if (!Cast->getSrcTy()->isIntegerTy(8))
- return;
+ return false;
ToRemove.push_back(Cast);
auto *Replacement = ReplacedValues[Cast->getOperand(0)];
if (Cast->getType() == Replacement->getType()) {
Cast->replaceAllUsesWith(Replacement);
- return;
+ return true;
}
Value *AdjustedCast = nullptr;
@@ -213,7 +214,7 @@ static void fixI8UseChain(Instruction &I,
if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (!GEP->getType()->isPointerTy() ||
!GEP->getSourceElementType()->isIntegerTy(8))
- return;
+ return false;
Value *BasePtr = GEP->getPointerOperand();
if (ReplacedValues.count(BasePtr))
@@ -248,15 +249,17 @@ static void fixI8UseChain(Instruction &I,
ReplacedValues[GEP] = NewGEP;
GEP->replaceAllUsesWith(NewGEP);
ToRemove.push_back(GEP);
+ return true;
}
+ return false;
}
-static void upcastI8AllocasAndUses(Instruction &I,
+static bool upcastI8AllocasAndUses(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
auto *AI = dyn_cast<AllocaInst>(&I);
if (!AI || !AI->getAllocatedType()->isIntegerTy(8))
- return;
+ return false;
Type *SmallestType = nullptr;
@@ -291,16 +294,17 @@ static void upcastI8AllocasAndUses(Instruction &I,
}
if (!SmallestType)
- return; // no valid casts found
+ return false; // no valid casts found
// Replace alloca
IRBuilder<> Builder(AI);
auto *NewAlloca = Builder.CreateAlloca(SmallestType);
ReplacedValues[AI] = NewAlloca;
ToRemove.push_back(AI);
+ return true;
}
-static void
+static bool
downcastI64toI32InsertExtractElements(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -318,6 +322,7 @@ downcastI64toI32InsertExtractElements(Instruction &I,
Extract->replaceAllUsesWith(NewExtract);
ToRemove.push_back(Extract);
+ return true;
}
}
@@ -335,8 +340,10 @@ downcastI64toI32InsertExtractElements(Instruction &I,
Insert->replaceAllUsesWith(Insert32Index);
ToRemove.push_back(Insert);
+ return true;
}
}
+ return false;
}
static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
@@ -453,17 +460,17 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
// Expands the instruction `I` into corresponding loads and stores if it is a
// memcpy call. In that case, the call instruction is added to the `ToRemove`
// vector. `ReplacedValues` is unused.
-static void legalizeMemCpy(Instruction &I,
+static bool legalizeMemCpy(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
- return;
+ return false;
Intrinsic::ID ID = CI->getIntrinsicID();
if (ID != Intrinsic::memcpy)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *Dst = CI->getArgOperand(0);
@@ -476,19 +483,20 @@ static void legalizeMemCpy(Instruction &I,
assert(IsVolatile->getZExtValue() == 0 && "Expected IsVolatile to be false");
emitMemcpyExpansion(Builder, Dst, Src, Length);
ToRemove.push_back(CI);
+ return true;
}
-static void legalizeMemSet(Instruction &I,
+static bool legalizeMemSet(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
- return;
+ return false;
Intrinsic::ID ID = CI->getIntrinsicID();
if (ID != Intrinsic::memset)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *Dst = CI->getArgOperand(0);
@@ -497,23 +505,25 @@ static void legalizeMemSet(Instruction &I,
assert(Size && "Expected Size to be a ConstantInt");
emitMemsetExpansion(Builder, Dst, Val, Size, ReplacedValues);
ToRemove.push_back(CI);
+ return true;
}
-static void updateFnegToFsub(Instruction &I,
+static bool updateFnegToFsub(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
const Intrinsic::ID ID = I.getOpcode();
if (ID != Instruction::FNeg)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *In = I.getOperand(0);
Value *Zero = ConstantFP::get(In->getType(), -0.0);
I.replaceAllUsesWith(Builder.CreateFSub(Zero, In));
ToRemove.push_back(&I);
+ return true;
}
-static void
+static bool
legalizeGetHighLowi64Bytes(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
@@ -523,13 +533,13 @@ legalizeGetHighLowi64Bytes(Instruction &I,
BitCast->getSrcTy()->isIntegerTy(64)) {
ToRemove.push_back(BitCast);
ReplacedValues[BitCast] = BitCast->getOperand(0);
- return;
+ return true;
}
}
if (auto *Extract = dyn_cast<ExtractElementInst>(&I)) {
if (!dyn_cast<BitCastInst>(Extract->getVectorOperand()))
- return;
+ return false;
auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType());
if (VecTy && VecTy->getElementType()->isIntegerTy(32) &&
VecTy->getNumElements() == 2) {
@@ -557,12 +567,14 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}
ToRemove.push_back(Extract);
Extract->replaceAllUsesWith(ReplacedValues[Extract]);
+ return true;
}
}
}
+ return false;
}
-static void
+static bool
legalizeScalarLoadStoreOnArrays(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -579,14 +591,14 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
PtrOpIndex = SI->getPointerOperandIndex();
LoadStoreTy = SI->getValueOperand()->getType();
} else
- return;
+ return false;
// If the load/store is not of a single-value type (i.e., scalar or vector)
// then we do not modify it. It shouldn't be a vector either because the
// dxil-data-scalarization pass is expected to run before this, but it's not
// incorrect to apply this transformation to vector load/stores.
if (!LoadStoreTy->isSingleValueType())
- return;
+ return false;
Type *ArrayTy;
if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
@@ -594,10 +606,10 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
ArrayTy = AllocaPtrOp->getAllocatedType();
else
- return;
+ return false;
if (!isa<ArrayType>(ArrayTy))
- return;
+ return false;
assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
"Expected array element type to be the same as to the scalar load or "
@@ -607,6 +619,7 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
Value *GEP = GetElementPtrInst::Create(
ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
I.setOperand(PtrOpIndex, GEP);
+ return true;
}
namespace {
@@ -624,13 +637,11 @@ public:
ReplacedValues.clear();
for (auto &I : instructions(F)) {
for (auto &LegalizationFn : LegalizationPipeline[Stage])
- LegalizationFn(I, ToRemove, ReplacedValues);
+ MadeChange |= LegalizationFn(I, ToRemove, ReplacedValues);
}
for (auto *Inst : reverse(ToRemove))
Inst->eraseFromParent();
-
- MadeChange |= !ToRemove.empty();
}
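+      // MadeChange now also reflects rewrites that remove no instructions
+      // (e.g. legalizeScalarLoadStoreOnArrays only rewrites a pointer
+      // operand), which the old !ToRemove.empty() check missed.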
return MadeChange;
}
@@ -639,7 +650,7 @@ private:
enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages };
using LegalizationFnTy =
- std::function<void(Instruction &, SmallVectorImpl<Instruction *> &,
+ std::function<bool(Instruction &, SmallVectorImpl<Instruction *> &,
DenseMap<Value *, Value *> &)>;
SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages];
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 703a9e5..c8866bf 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -24,7 +24,6 @@
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -240,11 +239,6 @@ public:
for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)
F.removeParamAttrs(Idx, AttrMask);
- // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr
- if (Intrinsic::ID IID = F.getIntrinsicID();
- IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end)
- F.removeFnAttr(Attribute::Memory);
-
for (auto &BB : F) {
IRBuilder<> Builder(&BB);
for (auto &I : make_early_inc_range(BB)) {
@@ -253,7 +247,7 @@ public:
// Emitting NoOp bitcast instructions allows the ValueEnumerator to be
// unmodified as it reserves instruction IDs during construction.
- if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (auto LI = dyn_cast<LoadInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, LI->getPointerOperand(),
LI->getType())) {
@@ -263,7 +257,7 @@ public:
}
continue;
}
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (auto SI = dyn_cast<StoreInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, SI->getPointerOperand(),
SI->getValueOperand()->getType())) {
@@ -274,7 +268,7 @@ public:
}
continue;
}
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, GEP->getPointerOperand(),
GEP->getSourceElementType()))
@@ -286,17 +280,6 @@ public:
CB->removeRetAttrs(AttrMask);
for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx)
CB->removeParamAttrs(Idx, AttrMask);
- // LLVM 3.7 Lifetime intrinics require an i8* pointer operand, so we
- // insert a bitcast here to ensure that is the case
- if (isa<LifetimeIntrinsic>(CB)) {
- Value *PtrOperand = CB->getArgOperand(1);
- Builder.SetInsertPoint(CB);
- PointerType *PtrTy = cast<PointerType>(PtrOperand->getType());
- Value *NoOpBitcast = Builder.Insert(
- CastInst::Create(Instruction::BitCast, PtrOperand,
- Builder.getPtrTy(PtrTy->getAddressSpace())));
- CB->setArgOperand(1, NoOpBitcast);
- }
continue;
}
}
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 566f3a9..c33ec0e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -241,7 +241,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
}
static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
- bool Changed = false;
SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources;
for (BasicBlock &BB : F)
for (Instruction &I : BB)
@@ -254,7 +253,7 @@ static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
for (auto &[II, RI] : Resources)
replaceAccess(II, RI);
- return Changed;
+ return !Resources.empty();
}
PreservedAnalyses DXILResourceAccess::run(Function &F,
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index dfc8162..ebdfcaa 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/DXILMetadataAnalysis.h"
#include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -29,25 +30,10 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
-#include <optional>
-#include <utility>
using namespace llvm;
using namespace llvm::dxil;
-static bool reportError(LLVMContext *Ctx, Twine Message,
- DiagnosticSeverity Severity = DS_Error) {
- Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
- return true;
-}
-
-static bool reportValueError(LLVMContext *Ctx, Twine ParamName,
- uint32_t Value) {
- Ctx->diagnose(DiagnosticInfoGeneric(
- "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error));
- return true;
-}
-
static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
unsigned int OpId) {
if (auto *CI =
@@ -56,453 +42,10 @@ static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
return std::nullopt;
}
-static std::optional<float> extractMdFloatValue(MDNode *Node,
- unsigned int OpId) {
- if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get()))
- return CI->getValueAPF().convertToFloat();
- return std::nullopt;
-}
-
-static std::optional<StringRef> extractMdStringValue(MDNode *Node,
- unsigned int OpId) {
- MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId));
- if (NodeText == nullptr)
- return std::nullopt;
- return NodeText->getString();
-}
-
-static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootFlagNode) {
-
- if (RootFlagNode->getNumOperands() != 2)
- return reportError(Ctx, "Invalid format for RootFlag Element");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
- RSD.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for RootFlag");
-
- return false;
-}
-
-static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootConstantNode) {
-
- if (RootConstantNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for RootConstants Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- // The parameter offset doesn't matter here - we recalculate it during
- // serialization Header.ParameterOffset = 0;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::Constants32Bit);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v1::RootConstants Constants;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
- Constants.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
- Constants.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
- Constants.Num32BitValues = *Val;
- else
- return reportError(Ctx, "Invalid value for Num32BitValues");
-
- RSD.ParametersContainer.addParameter(Header, Constants);
-
- return false;
-}
-
-static bool parseRootDescriptors(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootDescriptorNode,
- RootSignatureElementKind ElementKind) {
- assert(ElementKind == RootSignatureElementKind::SRV ||
- ElementKind == RootSignatureElementKind::UAV ||
- ElementKind == RootSignatureElementKind::CBV &&
- "parseRootDescriptors should only be called with RootDescriptor "
- "element kind.");
- if (RootDescriptorNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for Root Descriptor Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- switch (ElementKind) {
- case RootSignatureElementKind::SRV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV);
- break;
- case RootSignatureElementKind::UAV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV);
- break;
- case RootSignatureElementKind::CBV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV);
- break;
- default:
- llvm_unreachable("invalid Root Descriptor kind");
- break;
- }
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v2::RootDescriptor Descriptor;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
- Descriptor.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
- Descriptor.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (RSD.Version == 1) {
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
- }
- assert(RSD.Version > 1);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
- Descriptor.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Root Descriptor Flags");
-
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
-}
-
-static bool parseDescriptorRange(LLVMContext *Ctx,
- mcdxbc::DescriptorTable &Table,
- MDNode *RangeDescriptorNode) {
-
- if (RangeDescriptorNode->getNumOperands() != 6)
- return reportError(Ctx, "Invalid format for Descriptor Range");
-
- dxbc::RTS0::v2::DescriptorRange Range;
-
- std::optional<StringRef> ElementText =
- extractMdStringValue(RangeDescriptorNode, 0);
-
- if (!ElementText.has_value())
- return reportError(Ctx, "Descriptor Range, first element is not a string.");
-
- Range.RangeType =
- StringSwitch<uint32_t>(*ElementText)
- .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
- .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
- .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
- .Case("Sampler",
- llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
- .Default(~0U);
-
- if (Range.RangeType == ~0U)
- return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
- Range.NumDescriptors = *Val;
- else
- return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
- Range.BaseShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for BaseShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
- Range.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
- Range.OffsetInDescriptorsFromTableStart = *Val;
- else
- return reportError(Ctx,
- "Invalid value for OffsetInDescriptorsFromTableStart");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
- Range.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Descriptor Range Flags");
-
- Table.Ranges.push_back(Range);
- return false;
-}
-
-static bool parseDescriptorTable(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *DescriptorTableNode) {
- const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
- if (NumOperands < 2)
- return reportError(Ctx, "Invalid format for Descriptor Table");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- mcdxbc::DescriptorTable Table;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
-
- for (unsigned int I = 2; I < NumOperands; I++) {
- MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- if (parseDescriptorRange(Ctx, Table, Element))
- return true;
- }
-
- RSD.ParametersContainer.addParameter(Header, Table);
- return false;
-}
-
-static bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *StaticSamplerNode) {
- if (StaticSamplerNode->getNumOperands() != 14)
- return reportError(Ctx, "Invalid format for Static Sampler");
-
- dxbc::RTS0::v1::StaticSampler Sampler;
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1))
- Sampler.Filter = *Val;
- else
- return reportError(Ctx, "Invalid value for Filter");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2))
- Sampler.AddressU = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressU");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3))
- Sampler.AddressV = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressV");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4))
- Sampler.AddressW = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressW");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
- Sampler.MipLODBias = *Val;
- else
- return reportError(Ctx, "Invalid value for MipLODBias");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
- Sampler.MaxAnisotropy = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxAnisotropy");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7))
- Sampler.ComparisonFunc = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8))
- Sampler.BorderColor = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
- Sampler.MinLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MinLOD");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
- Sampler.MaxLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxLOD");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
- Sampler.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
- Sampler.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13))
- Sampler.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- RSD.StaticSamplers.push_back(Sampler);
- return false;
-}
-
-static bool parseRootSignatureElement(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *Element) {
- std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
- if (!ElementText.has_value())
- return reportError(Ctx, "Invalid format for Root Element");
-
- RootSignatureElementKind ElementKind =
- StringSwitch<RootSignatureElementKind>(*ElementText)
- .Case("RootFlags", RootSignatureElementKind::RootFlags)
- .Case("RootConstants", RootSignatureElementKind::RootConstants)
- .Case("RootCBV", RootSignatureElementKind::CBV)
- .Case("RootSRV", RootSignatureElementKind::SRV)
- .Case("RootUAV", RootSignatureElementKind::UAV)
- .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
- .Case("StaticSampler", RootSignatureElementKind::StaticSamplers)
- .Default(RootSignatureElementKind::Error);
-
- switch (ElementKind) {
-
- case RootSignatureElementKind::RootFlags:
- return parseRootFlags(Ctx, RSD, Element);
- case RootSignatureElementKind::RootConstants:
- return parseRootConstants(Ctx, RSD, Element);
- case RootSignatureElementKind::CBV:
- case RootSignatureElementKind::SRV:
- case RootSignatureElementKind::UAV:
- return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
- case RootSignatureElementKind::DescriptorTable:
- return parseDescriptorTable(Ctx, RSD, Element);
- case RootSignatureElementKind::StaticSamplers:
- return parseStaticSampler(Ctx, RSD, Element);
- case RootSignatureElementKind::Error:
- return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
- }
-
- llvm_unreachable("Unhandled RootSignatureElementKind enum.");
-}
-
-static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *Node) {
- bool HasError = false;
-
- // Loop through the Root Elements of the root signature.
- for (const auto &Operand : Node->operands()) {
- MDNode *Element = dyn_cast<MDNode>(Operand);
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element);
- }
-
- return HasError;
-}
-
-static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) {
-
- if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) {
- return reportValueError(Ctx, "Version", RSD.Version);
- }
-
- if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
- return reportValueError(Ctx, "RootFlags", RSD.Flags);
- }
-
- for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
- if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Info.Header.ShaderVisibility);
-
- assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
- "Invalid value for ParameterType");
-
- switch (Info.Header.ParameterType) {
-
- case llvm::to_underlying(dxbc::RootParameterType::CBV):
- case llvm::to_underlying(dxbc::RootParameterType::UAV):
- case llvm::to_underlying(dxbc::RootParameterType::SRV): {
- const dxbc::RTS0::v2::RootDescriptor &Descriptor =
- RSD.ParametersContainer.getRootDescriptor(Info.Location);
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister",
- Descriptor.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace);
-
- if (RSD.Version > 1) {
- if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version,
- Descriptor.Flags))
- return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags);
- }
- break;
- }
- case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
- const mcdxbc::DescriptorTable &Table =
- RSD.ParametersContainer.getDescriptorTable(Info.Location);
- for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
- if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType))
- return reportValueError(Ctx, "RangeType", Range.RangeType);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
-
- if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
- return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors);
-
- if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag(
- RSD.Version, Range.RangeType, Range.Flags))
- return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
- }
- break;
- }
- }
- }
-
- for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) {
- if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter))
- return reportValueError(Ctx, "Filter", Sampler.Filter);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU))
- return reportValueError(Ctx, "AddressU", Sampler.AddressU);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV))
- return reportValueError(Ctx, "AddressV", Sampler.AddressV);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW))
- return reportValueError(Ctx, "AddressW", Sampler.AddressW);
-
- if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
- return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias);
-
- if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
- return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy);
-
- if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc))
- return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc);
-
- if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor))
- return reportValueError(Ctx, "BorderColor", Sampler.BorderColor);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD))
- return reportValueError(Ctx, "MinLOD", Sampler.MinLOD);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
- return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD);
-
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace);
-
- if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Sampler.ShaderVisibility);
- }
-
- return false;
+static bool reportError(LLVMContext *Ctx, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
+ return true;
}
static SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc>
@@ -584,7 +127,9 @@ analyzeModule(Module &M) {
// Static sampler offset is calculated when writing the DXContainer.
RSD.StaticSamplersOffset = 0u;
- if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) {
+ hlsl::rootsig::MetadataParser MDParser(RootElementListNode);
+
+ if (MDParser.ParseRootSignature(Ctx, RSD)) {
return RSDMap;
}
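For reference, the operand layout that the removed parsers decoded by hand,
and that the new hlsl::rootsig::MetadataParser is assumed to consume in the
same shape. A minimal sketch; the accessor name is illustrative and not part
of this patch (needs llvm/IR/Metadata.h and llvm/IR/Constants.h):

    // Root descriptor element node (RootCBV/RootSRV/RootUAV), as parsed above:
    //   operand 0: MDString  element kind, e.g. "RootCBV"
    //   operand 1: i32       ShaderVisibility
    //   operand 2: i32       ShaderRegister
    //   operand 3: i32       RegisterSpace
    //   operand 4: i32       Flags (only read when RSD.Version > 1)
    static std::optional<uint32_t> getOperandAsU32(const MDNode *N,
                                                   unsigned Idx) {
      if (Idx >= N->getNumOperands())
        return std::nullopt;
      if (auto *CI = mdconst::dyn_extract<ConstantInt>(N->getOperand(Idx)))
        return CI->getZExtValue(); // uint64_t narrowed to uint32_t
      return std::nullopt;
    }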
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index fc39b38..254b7ff 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -26,17 +26,6 @@
namespace llvm {
namespace dxil {
-enum class RootSignatureElementKind {
- Error = 0,
- RootFlags = 1,
- RootConstants = 2,
- SRV = 3,
- UAV = 4,
- CBV = 5,
- DescriptorTable = 6,
- StaticSamplers = 7
-};
-
class RootSignatureBindingInfo {
private:
SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> FuncToRsMap;
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index eb4adfe..e7e7f2c 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -106,11 +106,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
DXILResourceTypeMap &DRTM,
const ModuleMetadataInfo &MMDI) {
if (!CSF.Doubles)
- CSF.Doubles = I.getType()->isDoubleTy();
+ CSF.Doubles = I.getType()->getScalarType()->isDoubleTy();
if (!CSF.Doubles) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isDoubleTy()) {
+ if (Op->getType()->getScalarType()->isDoubleTy()) {
CSF.Doubles = true;
break;
}
@@ -130,12 +130,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
}
if (!CSF.LowPrecisionPresent)
- CSF.LowPrecisionPresent =
- I.getType()->isIntegerTy(16) || I.getType()->isHalfTy();
+ CSF.LowPrecisionPresent = I.getType()->getScalarType()->isIntegerTy(16) ||
+ I.getType()->getScalarType()->isHalfTy();
if (!CSF.LowPrecisionPresent) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isIntegerTy(16) || Op->getType()->isHalfTy()) {
+ if (Op->getType()->getScalarType()->isIntegerTy(16) ||
+ Op->getType()->getScalarType()->isHalfTy()) {
CSF.LowPrecisionPresent = true;
break;
}
@@ -150,11 +151,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
}
if (!CSF.Int64Ops)
- CSF.Int64Ops = I.getType()->isIntegerTy(64);
+ CSF.Int64Ops = I.getType()->getScalarType()->isIntegerTy(64);
if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isIntegerTy(64)) {
+ if (Op->getType()->getScalarType()->isIntegerTy(64)) {
CSF.Int64Ops = true;
break;
}
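The repeated change in this hunk is easiest to read in isolation. A minimal
sketch, assuming a free-standing helper (the name is illustrative, not part
of the patch):

    // Type::getScalarType() returns the element type for vector types and
    // the type itself for scalars, so one check covers both double and
    // <N x double>; calling isDoubleTy() directly missed the vector case.
    static bool hasDoubleScalar(const llvm::Type *Ty) {
      return Ty->getScalarType()->isDoubleTy();
    }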
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 46d5d71..1d79c30 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -2545,25 +2545,6 @@ void DXILBitcodeWriter::writeInstruction(const Instruction &I, unsigned InstID,
Vals.clear();
}
-// HLSL Change
-namespace {
-struct ValueNameCreator {
- MallocAllocator Allocator;
- SmallVector<ValueName *, 2>
- ValueNames; // SmallVector N = 2 because we currently only expect this
- // to hold ValueNames for Lifetime intrinsics
- ~ValueNameCreator() {
- for (auto *VN : ValueNames)
- VN->Destroy(Allocator);
- }
- ValueName *create(StringRef Name, Value *V) {
- ValueName *VN = ValueName::create(Name, Allocator, V);
- ValueNames.push_back(VN);
- return VN;
- }
-};
-} // anonymous namespace
-
// Emit names for globals/functions etc.
void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
const ValueSymbolTable &VST) {
@@ -2578,24 +2559,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
// to ensure the binary is the same no matter what values ever existed.
SmallVector<const ValueName *, 16> SortedTable;
- // HLSL Change
- ValueNameCreator VNC;
for (auto &VI : VST) {
- ValueName *VN = VI.second->getValueName();
- // Clang mangles lifetime intrinsic names by appending '.p0' to the end,
- // making them invalid lifetime intrinsics in LLVM 3.7. We can't
- // demangle in dxil-prepare because it would result in invalid IR.
- // Therefore we have to do this in the bitcode writer while writing its
- // name to the symbol table.
- if (const Function *Fn = dyn_cast<Function>(VI.getValue());
- Fn && Fn->isIntrinsic()) {
- Intrinsic::ID IID = Fn->getIntrinsicID();
- if (IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end)
- VN = VNC.create(Intrinsic::getBaseName(IID), VI.second);
- }
- SortedTable.push_back(VN);
+ SortedTable.push_back(VI.second->getValueName());
}
-
// The keys are unique, so there shouldn't be stability issues.
llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) {
return A->first() < B->first();
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
index dfc79039c..1bd5dd7 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
@@ -52,6 +53,53 @@ public:
}
};
+static void legalizeLifetimeIntrinsics(Module &M) {
+ for (Function &F : M) {
+ Intrinsic::ID IID = F.getIntrinsicID();
+ if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+ continue;
+
+ // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr
+ F.removeFnAttr(Attribute::Memory);
+
+ // Lifetime intrinsics in LLVM 3.7 do not have mangled names
+ F.setName(Intrinsic::getBaseName(IID));
+
+ // Lifetime intrinsics in LLVM 3.7 require an i8* operand, so we insert
+ // bitcasts to ensure that is the case
+ for (auto *User : make_early_inc_range(F.users())) {
+ CallInst *CI = dyn_cast<CallInst>(User);
+ assert(CI && "Expected user of a lifetime intrinsic function to be a "
+ "lifetime intrinsic call");
+ Value *PtrOperand = CI->getArgOperand(1);
+ PointerType *PtrTy = cast<PointerType>(PtrOperand->getType());
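+ // Source and destination types are identical here, so the cast is a
+ // no-op; it only reproduces the explicit i8* cast shape that the
+ // 3.7-style bitcode expects.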
+ Value *NoOpBitCast = CastInst::Create(Instruction::BitCast, PtrOperand,
+ PtrTy, "", CI->getIterator());
+ CI->setArgOperand(1, NoOpBitCast);
+ }
+ }
+}
+
+static void removeLifetimeIntrinsics(Module &M) {
+ for (Function &F : make_early_inc_range(M)) {
+ if (Intrinsic::ID IID = F.getIntrinsicID();
+ IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+ continue;
+
+ for (User *U : make_early_inc_range(F.users())) {
+ LifetimeIntrinsic *LI = dyn_cast<LifetimeIntrinsic>(U);
+ assert(LI && "Expected user of lifetime intrinsic function to be "
+ "a LifetimeIntrinsic instruction");
+ BitCastInst *BCI = dyn_cast<BitCastInst>(LI->getArgOperand(1));
+ assert(BCI && "Expected pointer operand of LifetimeIntrinsic to be a "
+ "BitCastInst");
+ LI->eraseFromParent();
+ BCI->eraseFromParent();
+ }
+ F.eraseFromParent();
+ }
+}
+
class EmbedDXILPass : public llvm::ModulePass {
public:
static char ID; // Pass identification, replacement for typeid
@@ -70,8 +118,17 @@ public:
// Only the output bitcode needs to use the DXIL triple.
M.setTargetTriple(Triple("dxil-ms-dx"));
+ // Perform late legalization of lifetime intrinsics that would otherwise
+ // fail the Module Verifier if performed in an earlier pass
+ legalizeLifetimeIntrinsics(M);
+
WriteDXILToFile(M, OS);
+ // We no longer need lifetime intrinsics after bitcode serialization, so we
+ // simply remove them to keep the Module Verifier happy after our
+ // not-so-legal legalizations
+ removeLifetimeIntrinsics(M);
+
// Recover triple.
M.setTargetTriple(OriginalTriple);
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
index f0ca908..6050649 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -336,5 +336,4 @@ class InstDuplex<bits<4> iClass, string cstr = ""> : Instruction,
// Instruction Classes Definitions -
//===----------------------------------------------------------------------===//
-include "HexagonInstrFormatsV60.td"
include "HexagonInstrFormatsV65.td"
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
deleted file mode 100644
index 86a8218..0000000
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ /dev/null
@@ -1,21 +0,0 @@
-//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 instruction classes in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//----------------------------------------------------------------------------//
-// Instruction Classes Definitions +
-//----------------------------------------------------------------------------//
-
-class CVI_VA_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VA>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
- OpcodeHexagon, Requires<[HasV60, UseHVX]>;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
index 246a1d3..85b826f 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
@@ -20,11 +20,6 @@
// Instruction Classes Definitions +
//----------------------------------------------------------------------------//
-class CVI_VA_Resource_NoOpcode<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VA>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>;
-
class CVI_GATHER_TMP_LD_Resource_NoOpcode<dag outs, dag ins, string asmstr,
list<dag> pattern = [], string cstr = "",
InstrItinClass itin = CVI_GATHER_PSEUDO>
diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
deleted file mode 100644
index 44f39a3..0000000
--- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ /dev/null
@@ -1,414 +0,0 @@
-//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-def : T_PR_pat <M2_vrcmpys_s1, int_hexagon_M2_vrcmpys_s1>;
-def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
-def : T_PR_pat <M2_vrcmpys_s1rp, int_hexagon_M2_vrcmpys_s1rp>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
-
-def: T_RP_pat<A2_addsp, int_hexagon_A2_addsp>;
-def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
-def: T_PP_pat<A2_minp, int_hexagon_A2_minp>;
-def: T_PP_pat<A2_minup, int_hexagon_A2_minup>;
-def: T_PP_pat<A2_maxp, int_hexagon_A2_maxp>;
-def: T_PP_pat<A2_maxup, int_hexagon_A2_maxup>;
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
-def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
-def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
-
-//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
-def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
-def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
-// Rxx[^]=vpmpyh(Rs,Rt)
-def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
-// Rxx^=pmpyw(Rs,Rt)
-def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
-
-//Rxx^=asr(Rss,Rt)
-def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
-//Rxx^=asl(Rss,Rt)
-def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
-//Rxx^=lsr(Rss,Rt)
-def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
-//Rxx^=lsl(Rss,Rt)
-def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
-
-// Multiply and use upper result
-def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
-def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
-def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
-def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
-def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
-
-def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
-def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
-
-def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
-def: T_P_pat<S2_ct0p, int_hexagon_S2_ct0p>;
-def: T_P_pat<S2_ct1p, int_hexagon_S2_ct1p>;
-
-def: T_Q_RR_pat<C4_nbitsset, int_hexagon_C4_nbitsset>;
-def: T_Q_RR_pat<C4_nbitsclr, int_hexagon_C4_nbitsclr>;
-def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
-
-def : T_Q_PI_pat<A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi>;
-def : T_Q_PI_pat<A4_vcmpbgti, int_hexagon_A4_vcmpbgti>;
-def : T_Q_PI_pat<A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui>;
-def : T_Q_PI_pat<A4_vcmpheqi, int_hexagon_A4_vcmpheqi>;
-def : T_Q_PI_pat<A4_vcmphgti, int_hexagon_A4_vcmphgti>;
-def : T_Q_PI_pat<A4_vcmphgtui, int_hexagon_A4_vcmphgtui>;
-def : T_Q_PI_pat<A4_vcmpweqi, int_hexagon_A4_vcmpweqi>;
-def : T_Q_PI_pat<A4_vcmpwgti, int_hexagon_A4_vcmpwgti>;
-def : T_Q_PI_pat<A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui>;
-def : T_Q_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>;
-
-def : T_Q_RR_pat<A4_cmpbeq, int_hexagon_A4_cmpbeq>;
-def : T_Q_RR_pat<A4_cmpbgt, int_hexagon_A4_cmpbgt>;
-def : T_Q_RR_pat<A4_cmpbgtu, int_hexagon_A4_cmpbgtu>;
-def : T_Q_RR_pat<A4_cmpheq, int_hexagon_A4_cmpheq>;
-def : T_Q_RR_pat<A4_cmphgt, int_hexagon_A4_cmphgt>;
-def : T_Q_RR_pat<A4_cmphgtu, int_hexagon_A4_cmphgtu>;
-
-def : T_Q_RI_pat<A4_cmpbeqi, int_hexagon_A4_cmpbeqi>;
-def : T_Q_RI_pat<A4_cmpbgti, int_hexagon_A4_cmpbgti>;
-def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
-
-def : T_Q_RI_pat<A4_cmpheqi, int_hexagon_A4_cmpheqi>;
-def : T_Q_RI_pat<A4_cmphgti, int_hexagon_A4_cmphgti>;
-def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
-
-def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
-def : T_Q_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>;
-
-def : T_RRR_pat <M4_mpyrr_addr, int_hexagon_M4_mpyrr_addr>;
-def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>;
-def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>;
-def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
-def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>;
-def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
-def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
-
-// Complex multiply 32x16
-def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
-def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
-
-def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
-def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
-
-def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
-def : T_PP_pat<A4_ornp, int_hexagon_A4_ornp>;
-
-// Complex add/sub halfwords/words
-def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
-def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
-def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
-def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
-
-def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
-def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
-
-// Extract bitfield
-def : T_PP_pat <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
-def : T_RP_pat <S4_extract_rp, int_hexagon_S4_extract_rp>;
-def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
-def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
-
-// Shift an immediate left by register amount
-def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
-
-// Vector reduce maximum halfwords
-def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
-def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
-
-// Vector reduce maximum words
-def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
-def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
-
-// Vector reduce minimum halfwords
-def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
-def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
-
-// Vector reduce minimum words
-def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
-def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
-
-// Rotate and reduce bytes
-def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
- u2_0ImmPred:$src3),
- (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
- IntRegs:$src3, u2_0ImmPred:$src4),
- (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
- IntRegs:$src3, u2_0ImmPred:$src4)>;
-
-// Vector conditional negate
-def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
-
-// Logical xor with xor accumulation
-def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
-
-// ALU64 - Vector min/max byte
-def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
-def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
-
-// Shift and add/sub/and/or
-def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
-def : T_IRI_pat <S4_ori_asl_ri, int_hexagon_S4_ori_asl_ri>;
-def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
-def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
-def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
-def : T_IRI_pat <S4_ori_lsr_ri, int_hexagon_S4_ori_lsr_ri>;
-def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
-def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
-
-// Split bitfield
-def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
-def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>;
-
-def: T_RR_pat<S4_parity, int_hexagon_S4_parity>;
-
-def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
-def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
-
-def: T_RI_pat<S4_clbaddi, int_hexagon_S4_clbaddi>;
-def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>;
-def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>;
-
-//*******************************************************************
-// ALU32/ALU
-//*******************************************************************
-
-// ALU32 / ALU / Logical Operations.
-def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
-def: T_RR_pat<A4_orn, int_hexagon_A4_orn>;
-
-//*******************************************************************
-// ALU32/PERM
-//*******************************************************************
-
-// Combine Words Into Doublewords.
-def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
-def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
-
-//*******************************************************************
-// ALU32/PRED
-//*******************************************************************
-
-// Compare
-def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
-
-// Compare To General Register.
-def: T_Q_RR_pat<C4_cmpneq, int_hexagon_C4_cmpneq>;
-def: T_Q_RR_pat<C4_cmplte, int_hexagon_C4_cmplte>;
-def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
-
-def: T_RR_pat<A4_rcmpeq, int_hexagon_A4_rcmpeq>;
-def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
-
-def: T_RI_pat<A4_rcmpeqi, int_hexagon_A4_rcmpeqi>;
-def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
-
-//*******************************************************************
-// CR
-//*******************************************************************
-
-// CR / Logical Operations On Predicates.
-def: T_Q_QQQ_pat<C4_and_and, int_hexagon_C4_and_and>;
-def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
-def: T_Q_QQQ_pat<C4_and_or, int_hexagon_C4_and_or>;
-def: T_Q_QQQ_pat<C4_and_orn, int_hexagon_C4_and_orn>;
-def: T_Q_QQQ_pat<C4_or_and, int_hexagon_C4_or_and>;
-def: T_Q_QQQ_pat<C4_or_andn, int_hexagon_C4_or_andn>;
-def: T_Q_QQQ_pat<C4_or_or, int_hexagon_C4_or_or>;
-def: T_Q_QQQ_pat<C4_or_orn, int_hexagon_C4_or_orn>;
-
-//*******************************************************************
-// XTYPE/ALU
-//*******************************************************************
-
-// Add And Accumulate.
-
-def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
-def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
-
-
-// XTYPE / ALU / Logical-logical Words.
-def : T_RRR_pat <M4_or_xor, int_hexagon_M4_or_xor>;
-def : T_RRR_pat <M4_and_xor, int_hexagon_M4_and_xor>;
-def : T_RRR_pat <M4_or_and, int_hexagon_M4_or_and>;
-def : T_RRR_pat <M4_and_and, int_hexagon_M4_and_and>;
-def : T_RRR_pat <M4_xor_and, int_hexagon_M4_xor_and>;
-def : T_RRR_pat <M4_or_or, int_hexagon_M4_or_or>;
-def : T_RRR_pat <M4_and_or, int_hexagon_M4_and_or>;
-def : T_RRR_pat <M4_xor_or, int_hexagon_M4_xor_or>;
-def : T_RRR_pat <M4_or_andn, int_hexagon_M4_or_andn>;
-def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
-def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
-
-def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
-def : T_RRI_pat <S4_or_andix, int_hexagon_S4_or_andix>;
-def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
-
-// Modulo wrap.
-def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
-
-// Arithmetic/Convergent round
-// Rd=[cround|round](Rs,Rt)[:sat]
-// Rd=[cround|round](Rs,#u5)[:sat]
-def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
-def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
-
-def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
-def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
-
-def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
-def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
-
-def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
-
-//Rdd[+]=vrmpybsu(Rss,Rtt)
-//Rdd[+]=vrmpybuu(Rss,Rtt)
-def : T_PP_pat <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
-def : T_PP_pat <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
-
-def : T_PP_pat <M5_vdmpybsu, int_hexagon_M5_vdmpybsu>;
-
-def : T_PPP_pat <M5_vrmacbsu, int_hexagon_M5_vrmacbsu>;
-def : T_PPP_pat <M5_vrmacbuu, int_hexagon_M5_vrmacbuu>;
-//Rxx+=vdmpybsu(Rss,Rtt):sat
-def : T_PPP_pat <M5_vdmacbsu, int_hexagon_M5_vdmacbsu>;
-
-// Vector multiply bytes
-// Rdd=vmpyb[s]u(Rs,Rt)
-def : T_RR_pat <M5_vmpybsu, int_hexagon_M5_vmpybsu>;
-def : T_RR_pat <M5_vmpybuu, int_hexagon_M5_vmpybuu>;
-
-// Rxx+=vmpyb[s]u(Rs,Rt)
-def : T_PRR_pat <M5_vmacbsu, int_hexagon_M5_vmacbsu>;
-def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
-
-// Rd=vaddhub(Rss,Rtt):sat
-def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
-
-def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
-def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
-def : T_FF_pat<F2_sfmpy, int_hexagon_F2_sfmpy>;
-def : T_FF_pat<F2_sfmax, int_hexagon_F2_sfmax>;
-def : T_FF_pat<F2_sfmin, int_hexagon_F2_sfmin>;
-
-def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>;
-def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>;
-def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>;
-
-def : T_Q_QQ_pat<C4_fastcorner9, int_hexagon_C4_fastcorner9>;
-def : T_Q_QQ_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
-
-def : T_P_pat <S5_popcountp, int_hexagon_S5_popcountp>;
-def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>;
-
-def : T_PI_pat <S2_asr_i_p_rnd, int_hexagon_S2_asr_i_p_rnd>;
-def : T_PI_pat <S2_asr_i_p_rnd_goodsyntax,
- int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
-
-def : T_PI_pat <S5_asrhub_rnd_sat_goodsyntax,
- int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
-
-def : T_PI_pat <S5_vasrhrnd_goodsyntax, int_hexagon_S5_vasrhrnd_goodsyntax>;
-
-def : T_FFF_pat <F2_sffma, int_hexagon_F2_sffma>;
-def : T_FFF_pat <F2_sffms, int_hexagon_F2_sffms>;
-def : T_FFF_pat <F2_sffma_lib, int_hexagon_F2_sffma_lib>;
-def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>;
-def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>;
-
-// Compare floating-point value
-def : T_Q_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
-def : T_Q_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
-def : T_Q_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
-def : T_Q_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
-
-def : T_Q_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
-def : T_Q_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
-def : T_Q_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
-def : T_Q_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
-
-// Create floating-point value
-def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>;
-def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>;
-def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>;
-def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>;
-
-def : T_Q_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
-def : T_Q_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
-def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>;
-def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>;
-def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>;
-def : T_R_pat <F2_conv_uw2df, int_hexagon_F2_conv_uw2df>;
-def : T_R_pat <F2_conv_w2sf, int_hexagon_F2_conv_w2sf>;
-def : T_R_pat <F2_conv_w2df, int_hexagon_F2_conv_w2df>;
-def : T_P_pat <F2_conv_ud2sf, int_hexagon_F2_conv_ud2sf>;
-def : T_P_pat <F2_conv_ud2df, int_hexagon_F2_conv_ud2df>;
-def : T_P_pat <F2_conv_d2sf, int_hexagon_F2_conv_d2sf>;
-def : T_P_pat <F2_conv_d2df, int_hexagon_F2_conv_d2df>;
-def : T_F_pat <F2_conv_sf2uw, int_hexagon_F2_conv_sf2uw>;
-def : T_F_pat <F2_conv_sf2w, int_hexagon_F2_conv_sf2w>;
-def : T_F_pat <F2_conv_sf2ud, int_hexagon_F2_conv_sf2ud>;
-def : T_F_pat <F2_conv_sf2d, int_hexagon_F2_conv_sf2d>;
-def : T_D_pat <F2_conv_df2uw, int_hexagon_F2_conv_df2uw>;
-def : T_D_pat <F2_conv_df2w, int_hexagon_F2_conv_df2w>;
-def : T_D_pat <F2_conv_df2ud, int_hexagon_F2_conv_df2ud>;
-def : T_D_pat <F2_conv_df2d, int_hexagon_F2_conv_df2d>;
-def : T_F_pat <F2_conv_sf2uw_chop, int_hexagon_F2_conv_sf2uw_chop>;
-def : T_F_pat <F2_conv_sf2w_chop, int_hexagon_F2_conv_sf2w_chop>;
-def : T_F_pat <F2_conv_sf2ud_chop, int_hexagon_F2_conv_sf2ud_chop>;
-def : T_F_pat <F2_conv_sf2d_chop, int_hexagon_F2_conv_sf2d_chop>;
-def : T_D_pat <F2_conv_df2uw_chop, int_hexagon_F2_conv_df2uw_chop>;
-def : T_D_pat <F2_conv_df2w_chop, int_hexagon_F2_conv_df2w_chop>;
-def : T_D_pat <F2_conv_df2ud_chop, int_hexagon_F2_conv_df2ud_chop>;
-def : T_D_pat <F2_conv_df2d_chop, int_hexagon_F2_conv_df2d_chop>;
diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
deleted file mode 100644
index 796979e..0000000
--- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ /dev/null
@@ -1,642 +0,0 @@
-//===- HexagonIntrinsicsV60.td - V60 instruction intrinsics -*- tablegen *-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-
-let AddedComplexity = 100 in {
-def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 HvxWR:$src1))),
- (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_lo)) >;
-
-def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 HvxWR:$src1))),
- (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_hi)) >;
-
-def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 HvxWR:$src1))),
- (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_lo)) >;
-
-def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))),
- (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_hi)) >;
-}
-
-def : Pat <(v64i1 (bitconvert (v16i32 HvxVR:$src1))),
- (v64i1 (V6_vandvrt(v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i1 (bitconvert (v32i16 HvxVR:$src1))),
- (v64i1 (V6_vandvrt(v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i1 (bitconvert (v64i8 HvxVR:$src1))),
- (v64i1 (V6_vandvrt(v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v16i32 (bitconvert (v64i1 HvxQR:$src1))),
- (v16i32 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v32i16 (bitconvert (v64i1 HvxQR:$src1))),
- (v32i16 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i8 (bitconvert (v64i1 HvxQR:$src1))),
- (v64i8 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i1 (bitconvert (v32i32 HvxVR:$src1))),
- (v128i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i1 (bitconvert (v64i16 HvxVR:$src1))),
- (v128i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i1 (bitconvert (v128i8 HvxVR:$src1))),
- (v128i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v32i32 (bitconvert (v128i1 HvxQR:$src1))),
- (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i16 (bitconvert (v128i1 HvxQR:$src1))),
- (v64i16 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i8 (bitconvert (v128i1 HvxQR:$src1))),
- (v128i8 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-let AddedComplexity = 140 in {
-def : Pat <(store (v64i1 HvxQR:$src1), (i32 IntRegs:$addr)),
- (V6_vS32b_ai IntRegs:$addr, 0,
- (v16i32 (V6_vandqrt (v64i1 HvxQR:$src1),
- (A2_tfrsi 0x01010101))))>;
-
-def : Pat <(v64i1 (load (i32 IntRegs:$addr))),
- (v64i1 (V6_vandvrt
- (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(store (v128i1 HvxQR:$src1), (i32 IntRegs:$addr)),
- (V6_vS32b_ai IntRegs:$addr, 0,
- (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1),
- (A2_tfrsi 0x01010101))))>;
-
-def : Pat <(v128i1 (load (i32 IntRegs:$addr))),
- (v128i1 (V6_vandvrt
- (v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>;
-}
-
-multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
- (MI IntRegs:$src1)>;
-}
-
-multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1),
- (MI HvxVR:$src1)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1),
- (MI HvxVR:$src1)>;
-}
-
-multiclass T_W_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1),
- (MI HvxWR:$src1)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1),
- (MI HvxWR:$src1)>;
-}
-
-multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1),
- (MI HvxQR:$src1)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1),
- (MI HvxQR:$src1)>;
-}
-
-multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B")HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B")HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2),
- (MI HvxWR:$src1, HvxVR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2),
- (MI HvxWR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
-}
-
-multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
-}
-
-multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2,
- IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxWR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxQR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
-}
-
-
-multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1,
- HvxVR:$src2, imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
-}
-
-multiclass T_WRI_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, IntRegs:$src2, imm:$src3),
- (MI HvxWR:$src1, IntRegs:$src2, imm:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1,
- IntRegs:$src2, imm:$src3),
- (MI HvxWR:$src1, IntRegs:$src2, imm:$src3)>;
-}
-
-multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2,
- IntRegs:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4)>;
-}
-
-multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- HvxVR:$src3, IntRegs:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-}
-
-multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3, IntRegs:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-}
-
-defm : T_WR_pat <V6_vtmpyb, int_hexagon_V6_vtmpyb>;
-defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>;
-defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>;
-defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>;
-defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>;
-defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>;
-defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>;
-defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>;
-defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>;
-defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>;
-defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>;
-defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>;
-defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>;
-defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>;
-defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>;
-defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>;
-defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>;
-defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>;
-defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>;
-defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>;
-defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>;
-defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>;
-defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>;
-defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>;
-defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>;
-defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>;
-defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>;
-defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>;
-defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>;
-defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>;
-defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>;
-defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>;
-
-defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>;
-defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>;
-defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>;
-defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>;
-defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>;
-defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>;
-defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>;
-defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>;
-defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>;
-defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>;
-defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>;
-defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>;
-defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>;
-defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>;
-defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>;
-defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>;
-defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>;
-defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>;
-defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>;
-defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>;
-defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>;
-defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>;
-defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>;
-defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>;
-defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>;
-defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>;
-defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>;
-defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>;
-defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>;
-defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>;
-defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>;
-defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>;
-defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>;
-defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>;
-defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>;
-defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>;
-defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>;
-defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>;
-defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>;
-defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>;
-defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>;
-defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>;
-defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>;
-defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>;
-defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>;
-defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>;
-defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>;
-defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>;
-defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>;
-defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>;
-defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>;
-defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>;
-defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>;
-defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>;
-defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>;
-defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>;
-defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>;
-defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>;
-defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>;
-defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>;
-defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>;
-defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>;
-defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>;
-defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>;
-
-defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>;
-defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>;
-defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>;
-defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>;
-defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>;
-defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>;
-defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>;
-defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>;
-defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>;
-defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>;
-defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>;
-
-defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>;
-defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>;
-
-defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>;
-defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>;
-defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>;
-defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>;
-
-defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>;
-defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>;
-defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>;
-defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>;
-defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>;
-defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>;
-defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>;
-defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>;
-
-defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>;
-defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>;
-defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>;
-defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>;
-defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>;
-defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>;
-defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>;
-defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>;
-defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>;
-defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>;
-defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>;
-defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>;
-defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>;
-defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>;
-defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>;
-
-// Compare instructions
-defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>;
-defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>;
-defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>;
-defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>;
-defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>;
-defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>;
-defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>;
-defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>;
-defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>;
-defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>;
-defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>;
-defm : T_QVV_pat <V6_veqw_or, int_hexagon_V6_veqw_or>;
-defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>;
-defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>;
-defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>;
-defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>;
-defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>;
-defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>;
-defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>;
-defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>;
-defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>;
-defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>;
-defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>;
-defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>;
-defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>;
-defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>;
-defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>;
-
-defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>;
-defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>;
-defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>;
-defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>;
-defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>;
-defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>;
-defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>;
-defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>;
-defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>;
-defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>;
-defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>;
-defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>;
-defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>;
-defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>;
-defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>;
-defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>;
-defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>;
-defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>;
-defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>;
-defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>;
-defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>;
-defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>;
-defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>;
-defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>;
-defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>;
-defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>;
-defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>;
-defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>;
-defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>;
-defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>;
-defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>;
-defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>;
-defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>;
-defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>;
-defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>;
-defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>;
-defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>;
-defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>;
-defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>;
-defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>;
-defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>;
-defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>;
-defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>;
-defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>;
-defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>;
-defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>;
-
-defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>;
-defm : T_QVV_pat <V6_vaddhq, int_hexagon_V6_vaddhq>;
-defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>;
-defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>;
-defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>;
-defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>;
-defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>;
-defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>;
-defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>;
-defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>;
-defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>;
-defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>;
-
-defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>;
-defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>;
-defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>;
-defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>;
-defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>;
-defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>;
-defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>;
-defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>;
-defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>;
-defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>;
-defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>;
-defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>;
-defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>;
-defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>;
-defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>;
-defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>;
-defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>;
-defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>;
-defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>;
-defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>;
-defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>;
-defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>;
-defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>;
-
-defm : T_W_pat <V6_lo, int_hexagon_V6_lo>;
-defm : T_W_pat <V6_hi, int_hexagon_V6_hi>;
-defm : T_W_pat <V6_vassignp, int_hexagon_V6_vassignp>;
-
-defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>;
-defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>;
-defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>;
-
-defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>;
-defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>;
-defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>;
-
-// assembler mapped.
-//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>;
-// not present earlier.. need to add intrinsic
-defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>;
-defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>;
-defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>;
-defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>;
-defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>;
-defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>;
-defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>;
-defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>;
-defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>;
-
-defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>;
-defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>;
-
-defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>;
-defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>;
-defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>;
-defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>;
-
-defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>;
-defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>;
-defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>;
-defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>;
-defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>;
-defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>;
-defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>;
-defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>;
-defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>;
-defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>;
-defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>;
-defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>;
-defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>;
-defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>;
-defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>;
-defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>;
-defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>;
-
-defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>;
-defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
-defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>;
-defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>;
-defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
-defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>;
-
-defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>;
-defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>;
-defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>;
-defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>;
-
-defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
-def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>;
-def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>;
-def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>;
-def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>;
-def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>;
-def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>;
-def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>;
-def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>;
-def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>;
-def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>;
-def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>;
-def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
-
-defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
-defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
-
-//def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
-
-def: Pat<(v64i16 (trunc v64i32:$Vdd)),
- (v64i16 (V6_vpackwh_sat
- (v32i32 (V6_hi HvxWR:$Vdd)),
- (v32i32 (V6_lo HvxWR:$Vdd))))>;
-
-def: Pat<(int_hexagon_V6_vd0), (V6_vd0)>;
-def: Pat<(int_hexagon_V6_vd0_128B), (V6_vd0)>;
-
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index c2eb24b..c34eecd 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -38,7 +38,6 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Module.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
deleted file mode 100644
index 2fcefe6..0000000
--- a/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
+++ /dev/null
@@ -1,179 +0,0 @@
-//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-multiclass T_VR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_VVL_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- IntRegsLow8:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>;
-}
-
-multiclass T_VV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_WW_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
-}
-
-multiclass T_WVV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_WR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_WWR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2,
- IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_ZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_VZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxQR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_ZV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxVR:$src2),
- (MI HvxQR:$src1, HvxVR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2),
- (MI HvxQR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_R_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID IntRegs:$src1),
- (MI IntRegs:$src1)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
- (MI IntRegs:$src1)>;
-}
-
-multiclass T_ZZ_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
-}
-
-multiclass T_VVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
-}
-
-multiclass T_VVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- HvxVR:$src3, imm:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
-}
-
-multiclass T_WVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
-}
-
-def : T_R_pat <S6_vsplatrbp, int_hexagon_S6_vsplatrbp>;
-def : T_PP_pat <M6_vabsdiffb, int_hexagon_M6_vabsdiffb>;
-def : T_PP_pat <M6_vabsdiffub, int_hexagon_M6_vabsdiffub>;
-def : T_PP_pat <S6_vtrunehb_ppp, int_hexagon_S6_vtrunehb_ppp>;
-def : T_PP_pat <S6_vtrunohb_ppp, int_hexagon_S6_vtrunohb_ppp>;
-
-defm : T_VR_HVX_gen_pat <V6_vlsrb, int_hexagon_V6_vlsrb>;
-defm : T_VR_HVX_gen_pat <V6_vmpyiwub, int_hexagon_V6_vmpyiwub>;
-defm : T_VVL_HVX_gen_pat <V6_vasrwuhrndsat, int_hexagon_V6_vasrwuhrndsat>;
-defm : T_VVL_HVX_gen_pat <V6_vasruwuhrndsat, int_hexagon_V6_vasruwuhrndsat>;
-defm : T_VVL_HVX_gen_pat <V6_vasrhbsat, int_hexagon_V6_vasrhbsat>;
-defm : T_VVL_HVX_gen_pat <V6_vlutvvb_nm, int_hexagon_V6_vlutvvb_nm>;
-defm : T_VVL_HVX_gen_pat <V6_vlutvwh_nm, int_hexagon_V6_vlutvwh_nm>;
-defm : T_VV_HVX_gen_pat <V6_vrounduwuh, int_hexagon_V6_vrounduwuh>;
-defm : T_VV_HVX_gen_pat <V6_vrounduhub, int_hexagon_V6_vrounduhub>;
-defm : T_VV_HVX_gen_pat <V6_vadduwsat, int_hexagon_V6_vadduwsat>;
-defm : T_VV_HVX_gen_pat <V6_vsubuwsat, int_hexagon_V6_vsubuwsat>;
-defm : T_VV_HVX_gen_pat <V6_vaddbsat, int_hexagon_V6_vaddbsat>;
-defm : T_VV_HVX_gen_pat <V6_vsubbsat, int_hexagon_V6_vsubbsat>;
-defm : T_VV_HVX_gen_pat <V6_vaddububb_sat, int_hexagon_V6_vaddububb_sat>;
-defm : T_VV_HVX_gen_pat <V6_vsubububb_sat, int_hexagon_V6_vsubububb_sat>;
-defm : T_VV_HVX_gen_pat <V6_vmpyewuh_64, int_hexagon_V6_vmpyewuh_64>;
-defm : T_VV_HVX_gen_pat <V6_vmaxb, int_hexagon_V6_vmaxb>;
-defm : T_VV_HVX_gen_pat <V6_vminb, int_hexagon_V6_vminb>;
-defm : T_VV_HVX_gen_pat <V6_vsatuwuh, int_hexagon_V6_vsatuwuh>;
-defm : T_VV_HVX_gen_pat <V6_vaddclbw, int_hexagon_V6_vaddclbw>;
-defm : T_VV_HVX_gen_pat <V6_vaddclbh, int_hexagon_V6_vaddclbh>;
-defm : T_WW_HVX_gen_pat <V6_vadduwsat_dv, int_hexagon_V6_vadduwsat_dv>;
-defm : T_WW_HVX_gen_pat <V6_vsubuwsat_dv, int_hexagon_V6_vsubuwsat_dv>;
-defm : T_WW_HVX_gen_pat <V6_vaddbsat_dv, int_hexagon_V6_vaddbsat_dv>;
-defm : T_WW_HVX_gen_pat <V6_vsubbsat_dv, int_hexagon_V6_vsubbsat_dv>;
-defm : T_WVV_HVX_gen_pat <V6_vaddhw_acc, int_hexagon_V6_vaddhw_acc>;
-defm : T_WVV_HVX_gen_pat <V6_vadduhw_acc, int_hexagon_V6_vadduhw_acc>;
-defm : T_WVV_HVX_gen_pat <V6_vaddubh_acc, int_hexagon_V6_vaddubh_acc>;
-defm : T_WVV_HVX_gen_pat <V6_vmpyowh_64_acc, int_hexagon_V6_vmpyowh_64_acc>;
-defm : T_WR_HVX_gen_pat <V6_vmpauhb, int_hexagon_V6_vmpauhb>;
-defm : T_WWR_HVX_gen_pat <V6_vmpauhb_acc, int_hexagon_V6_vmpauhb_acc>;
-defm : T_VVR_HVX_gen_pat <V6_vmpyiwub_acc, int_hexagon_V6_vmpyiwub_acc>;
-defm : T_ZR_HVX_gen_pat <V6_vandnqrt, int_hexagon_V6_vandnqrt>;
-defm : T_VZR_HVX_gen_pat <V6_vandnqrt_acc, int_hexagon_V6_vandnqrt_acc>;
-defm : T_ZV_HVX_gen_pat <V6_vandvqv, int_hexagon_V6_vandvqv>;
-defm : T_ZV_HVX_gen_pat <V6_vandvnqv, int_hexagon_V6_vandvnqv>;
-defm : T_R_HVX_gen_pat <V6_pred_scalar2v2, int_hexagon_V6_pred_scalar2v2>;
-defm : T_R_HVX_gen_pat <V6_lvsplath, int_hexagon_V6_lvsplath>;
-defm : T_R_HVX_gen_pat <V6_lvsplatb, int_hexagon_V6_lvsplatb>;
-defm : T_ZZ_HVX_gen_pat <V6_shuffeqw, int_hexagon_V6_shuffeqw>;
-defm : T_ZZ_HVX_gen_pat <V6_shuffeqh, int_hexagon_V6_shuffeqh>;
-defm : T_VVI_HVX_gen_pat <V6_vlutvvbi, int_hexagon_V6_vlutvvbi>;
-defm : T_VVI_HVX_gen_pat <V6_vlutvwhi, int_hexagon_V6_vlutvwhi>;
-defm : T_VVVI_HVX_gen_pat <V6_vlutvvb_oracci, int_hexagon_V6_vlutvvb_oracci>;
-defm : T_WVVI_HVX_gen_pat <V6_vlutvwh_oracci, int_hexagon_V6_vlutvwh_oracci>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2378664..e915a3c4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2514,8 +2514,9 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
assert(ResTy.isVector());
unsigned NumElts = ResTy.getVectorNumElements();
- SDValue Vector = DAG.getUNDEF(ResTy);
- for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Vector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0));
+ for (unsigned i = 1; i < NumElts; ++i) {
Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
Node->getOperand(i),
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
@@ -4560,6 +4561,80 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
llvm_unreachable("Unexpected node type for vXi1 sign extension");
}
+static SDValue
+performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (Src.getOpcode() != ISD::SETCC || !Src.hasOneUse())
+ return SDValue();
+
+ bool UseLASX;
+ unsigned Opc = ISD::DELETED_NODE;
+ EVT CmpVT = Src.getOperand(0).getValueType();
+ EVT EltVT = CmpVT.getVectorElementType();
+
+ if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() == 128)
+ UseLASX = false;
+ else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
+ CmpVT.getSizeInBits() == 256)
+ UseLASX = true;
+ else
+ return SDValue();
+
+ SDValue SrcN1 = Src.getOperand(1);
+ switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
+ default:
+ break;
+ case ISD::SETEQ:
+ // x == 0 => not (vmsknez.b x)
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
+ break;
+ case ISD::SETGT:
+ // x > -1 => vmskgez.b x
+ if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+ break;
+ case ISD::SETGE:
+ // x >= 0 => vmskgez.b x
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+ break;
+ case ISD::SETLT:
+ // x < 0 => vmskltz.{b,h,w,d} x
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+ EltVT == MVT::i64))
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+ break;
+ case ISD::SETLE:
+ // x <= -1 => vmskltz.{b,h,w,d} x
+ if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+ EltVT == MVT::i64))
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+ break;
+ case ISD::SETNE:
+ // x != 0 => vmsknez.b x
+ if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+ Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
+ break;
+ }
+
+ if (Opc == ISD::DELETED_NODE)
+ return SDValue();
+
+ SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0));
+ EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, T);
+ return DAG.getBitcast(VT, V);
+}
+
static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const LoongArchSubtarget &Subtarget) {
@@ -4574,110 +4649,63 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
- unsigned Opc = ISD::DELETED_NODE;
// Combine SETCC and BITCAST into [X]VMSK{LT,GE,NE} when possible
+ SDValue Res = performSETCC_BITCASTCombine(N, DAG, DCI, Subtarget);
+ if (Res)
+ return Res;
+
+ // Generate vXi1 using [X]VMSKLTZ
+ MVT SExtVT;
+ unsigned Opc;
+ bool UseLASX = false;
+ bool PropagateSExt = false;
+
if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse()) {
- bool UseLASX;
EVT CmpVT = Src.getOperand(0).getValueType();
- EVT EltVT = CmpVT.getVectorElementType();
-
- if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() <= 128)
- UseLASX = false;
- else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
- CmpVT.getSizeInBits() <= 256)
- UseLASX = true;
- else
+ if (CmpVT.getSizeInBits() > 256)
return SDValue();
-
- SDValue SrcN1 = Src.getOperand(1);
- switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
- default:
- break;
- case ISD::SETEQ:
- // x == 0 => not (vmsknez.b x)
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
- break;
- case ISD::SETGT:
- // x > -1 => vmskgez.b x
- if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
- break;
- case ISD::SETGE:
- // x >= 0 => vmskgez.b x
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
- break;
- case ISD::SETLT:
- // x < 0 => vmskltz.{b,h,w,d} x
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
- (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
- EltVT == MVT::i64))
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- break;
- case ISD::SETLE:
- // x <= -1 => vmskltz.{b,h,w,d} x
- if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
- (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
- EltVT == MVT::i64))
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- break;
- case ISD::SETNE:
- // x != 0 => vmsknez.b x
- if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
- Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
- break;
- }
}
- // Generate vXi1 using [X]VMSKLTZ
- if (Opc == ISD::DELETED_NODE) {
- MVT SExtVT;
- bool UseLASX = false;
- bool PropagateSExt = false;
- switch (SrcVT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v2i1:
- SExtVT = MVT::v2i64;
- break;
- case MVT::v4i1:
- SExtVT = MVT::v4i32;
- if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
- SExtVT = MVT::v4i64;
- UseLASX = true;
- PropagateSExt = true;
- }
- break;
- case MVT::v8i1:
- SExtVT = MVT::v8i16;
- if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
- SExtVT = MVT::v8i32;
- UseLASX = true;
- PropagateSExt = true;
- }
- break;
- case MVT::v16i1:
- SExtVT = MVT::v16i8;
- if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
- SExtVT = MVT::v16i16;
- UseLASX = true;
- PropagateSExt = true;
- }
- break;
- case MVT::v32i1:
- SExtVT = MVT::v32i8;
+ switch (SrcVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i1:
+ SExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ SExtVT = MVT::v4i32;
+ if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+ SExtVT = MVT::v4i64;
UseLASX = true;
- break;
- };
- if (UseLASX && !Subtarget.has32S() && !Subtarget.hasExtLASX())
- return SDValue();
- Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
- : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- } else {
- Src = Src.getOperand(0);
- }
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v8i1:
+ SExtVT = MVT::v8i16;
+ if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+ SExtVT = MVT::v8i32;
+ UseLASX = true;
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v16i1:
+ SExtVT = MVT::v16i8;
+ if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+ SExtVT = MVT::v16i16;
+ UseLASX = true;
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v32i1:
+ SExtVT = MVT::v32i8;
+ UseLASX = true;
+ break;
+ };
+ if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX()))
+ return SDValue();
+ Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+ : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src);
EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
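The combine introduced above (performSETCC_BITCASTCombine) turns a vector compare followed by a bitcast-to-scalar into one [X]VMSK instruction that packs a single bit per lane. A scalar model of the vmskltz.b case, for illustration only (the real lowering emits a LoongArchISD node; this helper is hypothetical):

#include <cstdint>
#include <cstdio>

// Bit I of the result is the sign bit of byte I, i.e. the lane-wise
// result of (x < 0) packed into a scalar mask.
static uint32_t vmskltz_b(const int8_t *V, unsigned NumElts) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I < NumElts; ++I)
    Mask |= uint32_t(V[I] < 0) << I;
  return Mask;
}

int main() {
  int8_t V[16] = {-1, 2, -3, 4}; // remaining lanes are zero
  std::printf("0x%x\n", vmskltz_b(V, 16)); // prints 0x5
}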
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index a0107e4..5096a8f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1651,18 +1651,20 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
- (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
- (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
+ (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>;
def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
(XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
(XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+
+// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+ (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+ (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 962e7c2..3c9defb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1842,10 +1842,19 @@ def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$
(VINSGR2VR_W $vd, $rj, uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm),
(VINSGR2VR_D $vd, $rj, uimm1:$imm)>;
-def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
- (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
-def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
- (VINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm1:$imm)>;
+
+// VEXTRINS_{W/D}
+foreach imm = 0...3 in {
+ defvar Imm = !shl(imm, 4);
+ def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, imm),
+ (VEXTRINS_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), Imm)>;
+}
+
+foreach imm = 0...1 in {
+ defvar Imm = !shl(imm, 4);
+ def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, imm),
+ (VEXTRINS_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), Imm)>;
+}
// scalar_to_vector
def : Pat<(v4f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 7b9f115..8fa72bc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -177,74 +177,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
}
-// Linker relaxation may change code size. We have to insert Nops
-// for .align directive when linker relaxation enabled. So then Linker
-// could satisfy alignment by removing Nops.
-// The function returns the total Nops Size we need to insert.
-bool LoongArchAsmBackend::shouldInsertExtraNopBytesForCodeAlign(
- const MCAlignFragment &AF, unsigned &Size) {
- // Calculate Nops Size only when linker relaxation enabled.
- if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
- return false;
-
- // Ignore alignment if MaxBytesToEmit is less than the minimum Nop size.
- const unsigned MinNopLen = 4;
- if (AF.getMaxBytesToEmit() < MinNopLen)
- return false;
- Size = AF.getAlignment().value() - MinNopLen;
- return AF.getAlignment() > MinNopLen;
-}
-
-// We need to insert R_LARCH_ALIGN relocation type to indicate the
-// position of Nops and the total bytes of the Nops have been inserted
-// when linker relaxation enabled.
-// The function inserts fixup_loongarch_align fixup which eventually will
-// transfer to R_LARCH_ALIGN relocation type.
-// The improved R_LARCH_ALIGN requires symbol index. The lowest 8 bits of
-// addend represent alignment and the other bits of addend represent the
-// maximum number of bytes to emit. The maximum number of bytes is zero
-// means ignore the emit limit.
-bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) {
- // Insert the fixup only when linker relaxation enabled.
- if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
- return false;
-
- // Calculate total Nops we need to insert. If there are none to insert
- // then simply return.
- unsigned InsertedNopBytes;
- if (!shouldInsertExtraNopBytesForCodeAlign(AF, InsertedNopBytes))
- return false;
-
- MCSection *Sec = AF.getParent();
- MCContext &Ctx = getContext();
- const MCExpr *Dummy = MCConstantExpr::create(0, Ctx);
- MCFixup Fixup = MCFixup::create(0, Dummy, ELF::R_LARCH_ALIGN);
- unsigned MaxBytesToEmit = AF.getMaxBytesToEmit();
-
- auto createExtendedValue = [&]() {
- const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec];
- if (MCSym == nullptr) {
- // Define a marker symbol at the section with an offset of 0.
- MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align");
- Sym->setFragment(&*Sec->getBeginSymbol()->getFragment());
- Asm.registerSymbol(*Sym);
- MCSym = MCSymbolRefExpr::create(Sym, Ctx);
- getSecToAlignSym()[Sec] = MCSym;
- }
- return MCValue::get(&MCSym->getSymbol(), nullptr,
- MaxBytesToEmit << 8 | Log2(AF.getAlignment()));
- };
-
- uint64_t FixedValue = 0;
- MCValue Value = MaxBytesToEmit >= InsertedNopBytes
- ? MCValue::get(InsertedNopBytes)
- : createExtendedValue();
- Asm.getWriter().recordRelocation(AF, Fixup, Value, FixedValue);
-
- return true;
-}
-
bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
const MCValue &Target) {
switch (Fixup.getKind()) {
@@ -279,6 +211,53 @@ getRelocPairForSize(unsigned Size) {
}
}
+// Check if an R_LARCH_ALIGN relocation is needed for an alignment directive.
+// If conditions are met, compute the padding size and create a fixup encoding
+// the padding size in the addend. If MaxBytesToEmit is smaller than the padding
+// size, the fixup encodes MaxBytesToEmit in the higher bits and references a
+// per-section marker symbol.
+bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
+ // Use default handling unless linker relaxation is enabled and
+ // MaxBytesToEmit is at least the nop size.
+ if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax))
+ return false;
+ const unsigned MinNopLen = 4;
+ unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit();
+ if (MaxBytesToEmit < MinNopLen)
+ return false;
+
+ Size = F.getAlignment().value() - MinNopLen;
+ if (F.getAlignment() <= MinNopLen)
+ return false;
+
+ MCContext &Ctx = getContext();
+ const MCExpr *Expr = nullptr;
+ if (MaxBytesToEmit >= Size) {
+ Expr = MCConstantExpr::create(Size, Ctx);
+ } else {
+ MCSection *Sec = F.getParent();
+ const MCSymbolRefExpr *SymRef = getSecToAlignSym()[Sec];
+ if (SymRef == nullptr) {
+ // Define a marker symbol at the section with an offset of 0.
+ MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align");
+ Sym->setFragment(&*Sec->getBeginSymbol()->getFragment());
+ Asm->registerSymbol(*Sym);
+ SymRef = MCSymbolRefExpr::create(Sym, Ctx);
+ getSecToAlignSym()[Sec] = SymRef;
+ }
+ Expr = MCBinaryExpr::createAdd(
+ SymRef,
+ MCConstantExpr::create((MaxBytesToEmit << 8) | Log2(F.getAlignment()),
+ Ctx),
+ Ctx);
+ }
+ MCFixup Fixup =
+ MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
+ F.setVarFixups({Fixup});
+ F.getParent()->setLinkerRelaxable();
+ return true;
+}
+
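The extended addend built above packs two fields into one value: the low 8 bits hold log2 of the requested alignment and the remaining bits hold MaxBytesToEmit (zero meaning no emit limit). A standalone model of that encoding (hypothetical helper, shown only to make the bit layout concrete):

#include <cassert>
#include <cstdint>

// bits [7:0]  = log2(alignment)
// bits [63:8] = MaxBytesToEmit (0 means no emit limit)
uint64_t encodeAlignAddend(uint64_t Alignment, uint64_t MaxBytesToEmit) {
  assert(Alignment && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  uint64_t Log2Align = 63 - __builtin_clzll(Alignment);
  return (MaxBytesToEmit << 8) | Log2Align;
}

// e.g. encodeAlignAddend(32, 12) == (12 << 8) | 5 == 0xc05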
std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCFragment &F,
int64_t &Value) const {
const MCExpr &Expr = F.getLEBValue();
@@ -434,7 +413,7 @@ bool LoongArchAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
// Otherwise, check if the offset between the symbol and fragment is fully
// resolved, unaffected by linker-relaxable fragments (e.g. instructions or
- // offset-affected MCAlignFragment). Complements the generic
+ // offset-affected FT_Align fragments). Complements the generic
// isSymbolRefDifferenceFullyResolvedImpl.
if (!PCRelTemp)
PCRelTemp = getContext().createTempSymbol();
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index b32ba06..3d929fc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -45,20 +45,13 @@ public:
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved) override;
- // Return Size with extra Nop Bytes for alignment directive in code section.
- bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
- unsigned &Size) override;
-
- // Insert target specific fixup type for alignment directive in code section.
- bool shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) override;
-
bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target);
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
+ bool relaxAlign(MCFragment &F, unsigned &Size) override;
bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
std::pair<bool, bool> relaxLEB128(MCFragment &F,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
index 03ce004..7cefb3f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
@@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) {
bool Is64Bit = TT.isArch64Bit();
ABI TripleABI;
switch (TT.getEnvironment()) {
+ case llvm::Triple::EnvironmentType::UnknownEnvironment:
+ TripleABI = ABI_Unknown;
+ break;
case llvm::Triple::EnvironmentType::GNUSF:
case llvm::Triple::EnvironmentType::MuslSF:
TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S;
@@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
// 1. If the '-target-abi' is valid, use it.
if (IsABIValidForFeature(ArgProvidedABI)) {
- if (TT.hasEnvironment() && ArgProvidedABI != TripleABI)
+ if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI)
errs()
<< "warning: triple-implied ABI conflicts with provided target-abi '"
<< ABIName << "', using target-abi\n";
@@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
return Is64Bit ? ABI_LP64F : ABI_ILP32F;
return Is64Bit ? ABI_LP64S : ABI_ILP32S;
};
- if (ABIName.empty())
- errs() << "warning: the triple-implied ABI is invalid, ignoring and using "
- "feature-implied ABI\n";
- else
+ if (!ABIName.empty())
errs() << "warning: both target-abi and the triple-implied ABI are "
"invalid, ignoring and using feature-implied ABI\n";
return checkABIStandardized(GetFeatureABI());
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index ad8f5f0..7abe9c9 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -385,11 +385,12 @@ void MipsELFObjectWriter::sortRelocs(std::vector<ELFRelocationEntry> &Relocs) {
if (hasRelocationAddend())
return;
- // Sort relocations by the address they are applied to.
- llvm::sort(Relocs,
- [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
- return A.Offset < B.Offset;
- });
+ // Sort relocations by r_offset. There might be more than one at an offset
+ // with composed relocations or .reloc directives.
+ llvm::stable_sort(
+ Relocs, [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
// Place relocations in a list for reorder convenience. Hi16 contains the
// iterators of high-part relocations.
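The switch to llvm::stable_sort matters because several relocations may legitimately share an r_offset (composed relocations, or ones emitted via .reloc directives), and their relative order must survive the sort. A standalone illustration of the property, using hypothetical entries rather than the real ELFRelocationEntry type:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Entry { unsigned Offset; const char *Type; };

int main() {
  // The two entries at offset 8 model a composed relocation pair whose
  // order is significant; stable_sort keeps them in emission order.
  std::vector<Entry> Relocs = {
      {16, "R_MIPS_32"}, {8, "R_MIPS_HI16"}, {8, "R_MIPS_LO16"}};
  std::stable_sort(Relocs.begin(), Relocs.end(),
                   [](const Entry &A, const Entry &B) {
                     return A.Offset < B.Offset;
                   });
  for (const Entry &E : Relocs)
    std::printf("%u %s\n", E.Offset, E.Type);
  // 8 R_MIPS_HI16
  // 8 R_MIPS_LO16
  // 16 R_MIPS_32
}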
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index b89d689..feb4eb3 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1033,45 +1033,40 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
}
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_GPREL32));
- DF->appendContents(4, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_GPREL32);
+ S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_GPREL32));
- DF->appendContents(8, 0);
+ auto &S = getStreamer();
+ // fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
+ S.addFixup(Value, Mips::fixup_Mips_GPREL32);
+ S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_DTPREL32));
- DF->appendContents(4, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
+ S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_DTPREL64));
- DF->appendContents(8, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
+ S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_TPREL32));
- DF->appendContents(4, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_TPREL32);
+ S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
- MCFragment *DF = getStreamer().getOrCreateDataFragment();
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- Mips::fixup_Mips_TPREL64));
- DF->appendContents(8, 0);
+ auto &S = getStreamer();
+ S.addFixup(Value, Mips::fixup_Mips_TPREL64);
+ S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 77784be..ddcecc00 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
+ // Only div/rem/sqrt are legal for f64.
+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
+ setOperationAction(Op, MVT::f64, Legal);
+ }
setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
setOperationAction(Op, MVT::bf16, Promote);
AddPromotedToType(Op, MVT::bf16, MVT::f32);
@@ -2068,6 +2071,8 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
SelectionDAG &DAG,
unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
+ assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
+ Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
{A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
}
@@ -4006,7 +4011,10 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
+ case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v2i32;
Info.ptrVal = I.getArgOperand(0);
@@ -4029,6 +4037,30 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = Align(4);
+ return true;
+ }
+
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
+ case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::v4i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = Align(16);
+ return true;
+ }
+
case Intrinsic::nvvm_atomic_add_gen_f_cta:
case Intrinsic::nvvm_atomic_add_gen_f_sys:
case Intrinsic::nvvm_atomic_add_gen_i_cta:
@@ -5845,6 +5877,8 @@ static SDValue combineADDRSPACECAST(SDNode *N,
// details:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
+ assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
+
if (Mode == NVPTX::PTXPrmtMode::NONE)
return Selector;
@@ -5876,6 +5910,8 @@ static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
}
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
+ assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
+ Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
// {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
APInt BitField = B.concat(A);
APInt SelectorVal = getPRMTSelector(Selector, Mode);
@@ -6510,10 +6546,13 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
KnownBits BKnown = DAG.computeKnownBits(B, Depth);
// {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+ assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
+ "PRMT must have i32 operands");
+ assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
KnownBits BitField = BKnown.concat(AKnown);
APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
- for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) {
+ for (unsigned I : llvm::seq(4)) {
APInt Sel = SelectorVal.extractBits(4, I * 4);
unsigned Idx = Sel.getLoBits(3).getZExtValue();
unsigned Sign = Sel.getHiBits(1).getZExtValue();
@@ -6537,3 +6576,102 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode(
break;
}
}
+
+static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
+ const APInt &DemandedBits) {
+ APInt DemandedLHS = APInt(32, 0);
+ APInt DemandedRHS = APInt(32, 0);
+
+ for (unsigned I : llvm::seq(4)) {
+ if (DemandedBits.extractBits(8, I * 8).isZero())
+ continue;
+
+ APInt Sel = SelectorVal.extractBits(4, I * 4);
+ unsigned Idx = Sel.getLoBits(3).getZExtValue();
+ unsigned Sign = Sel.getHiBits(1).getZExtValue();
+
+ APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
+ unsigned ByteStart = (Idx % 4) * 8;
+ if (Sign)
+ Src.setBit(ByteStart + 7);
+ else
+ Src.setBits(ByteStart, ByteStart + 8);
+ }
+
+ return {DemandedLHS, DemandedRHS};
+}
+
+// Replace undef with 0 as this is easier for other optimizations such as
+// known bits.
+static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
+ if (!Op)
+ return SDValue();
+ if (Op.isUndef())
+ return DAG.getConstant(0, SDLoc(), MVT::i32);
+ return Op;
+}
+
+static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
+ const APInt &DemandedBits,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ unsigned Depth) {
+ assert(PRMT.getOpcode() == NVPTXISD::PRMT);
+ SDValue Op0 = PRMT.getOperand(0);
+ SDValue Op1 = PRMT.getOperand(1);
+ auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
+ if (!SelectorConst)
+ return SDValue();
+
+ unsigned Mode = PRMT.getConstantOperandVal(3);
+ const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
+
+ // Try to simplify the PRMT to one of the inputs if the used bytes are all
+ // from the same input in the correct order.
+ const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
+ const unsigned SelBits = (4 - LeadingBytes) * 4;
+ if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
+ return Op0;
+ if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
+ return Op1;
+
+ auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 =
+ TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
+ SDValue DemandedOp1 =
+ TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
+
+ DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
+ DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
+ if ((DemandedOp0 && DemandedOp0 != Op0) ||
+ (DemandedOp1 && DemandedOp1 != Op1)) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
+ }
+
+ return SDValue();
+}
+
+bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
+ Known.resetAll();
+
+ switch (Op.getOpcode()) {
+ case NVPTXISD::PRMT:
+ if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
+ *this, Depth)) {
+ TLO.CombineTo(Op, Result);
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+
+ computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+ return false;
+}
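The demanded-bits logic above relies on the PRMT selector encoding: each 4-bit nibble of the selector picks one of the eight bytes of the 64-bit {B, A} bit field, and the nibble's high bit switches to replicating that byte's sign bit. A scalar model of the default mode (a sketch for reference, not NVPTX's implementation):

#include <cstdint>
#include <cstdio>

uint32_t prmt(uint32_t A, uint32_t B, uint32_t Selector) {
  uint64_t BitField = (uint64_t(B) << 32) | A; // {b7..b4, b3..b0}
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t Sel = (Selector >> (I * 4)) & 0xF;
    uint8_t Byte = uint8_t(BitField >> ((Sel & 0x7) * 8));
    if (Sel & 0x8) // sign mode: replicate the byte's bit 7
      Byte = (Byte & 0x80) ? 0xFF : 0x00;
    Result |= uint32_t(Byte) << (I * 8);
  }
  return Result;
}

int main() {
  // Selector 0x3210 picks bytes 0..3 of A in order, the identity case
  // the simplification above folds to Op0.
  std::printf("0x%08x\n", prmt(0x03020100, 0x07060504, 0x3210)); // 0x03020100
}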
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index bc3548c..228e2aa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -275,6 +275,11 @@ public:
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth = 0) const override;
private:
const NVPTXSubtarget &STI; // cache the subtarget here
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5df4c6..442b900 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1234,7 +1234,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>;
defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>;
defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
-// sin/cos
+// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
: PatFrag<(ops node:$A),
@@ -1250,6 +1250,10 @@ def COS_APPROX_f32 :
BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz),
"cos.approx$ftz.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
+def TANH_APPROX_f32 :
+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>,
+ Requires<[hasPTX<70>, hasSM<75>]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f329f48..0a00220 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4758,7 +4758,14 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
!and(!eq(op, "ldmatrix"),
!eq(ptx_elt_type, "b8x16.b4x16_p64"),
- !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
+ !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
+
+ !and(!eq(op, "stmatrix"),!eq(ptx_elt_type, "b16"),
+ !eq(geom, "m8n8")) : [hasSM<90>, hasPTX<78>],
+
+ !and(!eq(op, "stmatrix"),
+ !eq(ptx_elt_type, "b8"),
+ !eq(geom, "m16n8")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
// template DAGs for instruction inputs/output.
dag Outs = !dag(outs, ptx_regs, reg_names);
@@ -5039,6 +5046,42 @@ defset list<WMMA_INSTR> LDMATRIXs = {
} // transposed
} // defset
+//
+// stmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
+//
+class STMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space>
+ : WMMA_INSTR<STMATRIX_NAME<Frag, Transposed>.record, [!con((ins ADDR:$dst), Frag.Ins)]>,
+ Requires<Frag.Predicates> {
+ // Build PatFrag that only matches particular address space.
+ dag PFOperands = !con((ops node:$dst),
+ !dag(ops, !listsplat(node, !size(Frag.regs)), Frag.reg_names));
+ PatFrag IntrFrag = PatFrag<PFOperands,
+ !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
+ !cond(!eq(Space, ".shared"): AS_match.shared,
+ true: AS_match.generic)>;
+ // Build AS-constrained pattern.
+ let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
+ let OutOperandList = (outs);
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
+ let AsmString = "stmatrix.sync.aligned."
+ # Frag.geom
+ # "." # Frag.frag
+ # !if(Transposed, ".trans", "")
+ # Space
+ # "." # Frag.ptx_elt_type
+ # " [$dst], " # Frag.regstring # ";";
+}
+
+// Create all stmatrix variants
+defset list<WMMA_INSTR> STMATRIXs = {
+ foreach transposed = [false, true] in {foreach space = [".shared", ""] in {
+ foreach frag = NVVM_MMA_OPS.all_stmatrix_ops in
+ if NVVM_STMATRIX_SUPPORTED<frag, transposed>.ret then
+ def : STMATRIX<WMMA_REGINFO<frag, "stmatrix">, transposed, space>;
+ } // space
+ } // transposed
+} // defset
+
// Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a
// dag, so the ptx.version must be appended *after* foreach replaces 'ins' with
// the instruction record.
@@ -5049,7 +5092,7 @@ class MMA_PAT<WMMA_INSTR wi>
Requires<wi.Predicates>;
// Build intrinsic->instruction patterns for all MMA instructions.
-foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in
+foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs, STMATRIXs) in
def : MMA_PAT<mma>;
multiclass MAPA<string suffix, Intrinsic Intr> {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 1ac91fa..80fac18 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -53,34 +53,30 @@ let Predicates = [IsISAFuture] in {
let Predicates = [HasVSX, IsISAFuture] in {
let mayLoad = 1 in {
- def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp),
- (ins memr:$RA, g8rc:$RB),
- "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-
- def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp),
- (ins memr:$RA, g8rc:$RB),
- "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVRL
+ : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
+ "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def LXVRLL
+ : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
+ "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def LXVPRL
+ : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
+ "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVPRLL
+ : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
+ "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
}
let mayStore = 1 in {
- def STXVRL : XX1Form_memOp<31, 653, (outs),
- (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def STXVRLL : XX1Form_memOp<31, 685, (outs),
- (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-
+ def STXVRL
+ : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
+ "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def STXVRLL
+ : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
+ "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs),
(ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
"stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-
def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs),
(ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
"stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 75a0272..996b6ef 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -171,7 +171,7 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
}
void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// The GenericScheduler that we use defaults to scheduling bottom up only.
// We want to schedule from both the top and the bottom and so we set
// OnlyBottomUp to false.
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 9a97d1a..3c59a47 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -240,7 +240,8 @@ public:
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
bool useAA() const override;
bool enableSubRegLiveness() const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index f76f8b3..2c37c3b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -302,6 +302,28 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
Inst = std::move(Res);
}
+// Check if an R_RISCV_ALIGN relocation is needed for an alignment directive.
+// If conditions are met, compute the padding size and create a fixup encoding
+// the padding size in the addend.
+bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
+ // Use default handling unless linker relaxation is enabled and the alignment
+ // is larger than the nop size.
+ const MCSubtargetInfo *STI = F.getSubtargetInfo();
+ if (!STI->hasFeature(RISCV::FeatureRelax))
+ return false;
+ unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
+ if (F.getAlignment() <= MinNopLen)
+ return false;
+
+ Size = F.getAlignment().value() - MinNopLen;
+ auto *Expr = MCConstantExpr::create(Size, getContext());
+ MCFixup Fixup =
+ MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN);
+ F.setVarFixups({Fixup});
+ F.getParent()->setLinkerRelaxable();
+ return true;
+}
+
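Concretely: with an alignment of 16 bytes and no Zca, MinNopLen is 4 and the fixup addend is 12. A minimal sketch of the same arithmetic, assuming only the values visible above:

    // Padding reserved for an R_RISCV_ALIGN relocation, as in relaxAlign.
    #include <cassert>

    static unsigned alignPadding(unsigned Alignment, bool HasZca) {
      unsigned MinNopLen = HasZca ? 2 : 4; // smallest nop we can emit
      assert(Alignment > MinNopLen && "handled by the default path");
      return Alignment - MinNopLen; // worst-case bytes the linker may delete
    }

    int main() { return alignPadding(16, false) == 12 ? 0 : 1; }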
bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
bool &WasRelaxed) const {
MCContext &C = getContext();
@@ -637,7 +659,7 @@ bool RISCVAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
// Otherwise, check if the offset between the symbol and fragment is fully
// resolved, unaffected by linker-relaxable fragments (e.g. instructions or
- // offset-affected MCAlignFragment). Complements the generic
+ // offset-affected FT_Align fragments). Complements the generic
// isSymbolRefDifferenceFullyResolvedImpl.
if (!PCRelTemp)
PCRelTemp = getContext().createTempSymbol();
@@ -887,55 +909,6 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
}
}
-// Linker relaxation may change code size. We have to insert Nops
-// for .align directive when linker relaxation enabled. So then Linker
-// could satisfy alignment by removing Nops.
-// The function return the total Nops Size we need to insert.
-bool RISCVAsmBackend::shouldInsertExtraNopBytesForCodeAlign(
- const MCAlignFragment &AF, unsigned &Size) {
- // Calculate Nops Size only when linker relaxation enabled.
- const MCSubtargetInfo *STI = AF.getSubtargetInfo();
- if (!STI->hasFeature(RISCV::FeatureRelax))
- return false;
-
- unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
-
- if (AF.getAlignment() <= MinNopLen) {
- return false;
- } else {
- Size = AF.getAlignment().value() - MinNopLen;
- return true;
- }
-}
-
-// We need to insert R_RISCV_ALIGN relocation type to indicate the
-// position of Nops and the total bytes of the Nops have been inserted
-// when linker relaxation enabled.
-// The function insert fixup_riscv_align fixup which eventually will
-// transfer to R_RISCV_ALIGN relocation type.
-bool RISCVAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) {
- // Insert the fixup only when linker relaxation enabled.
- const MCSubtargetInfo *STI = AF.getSubtargetInfo();
- if (!STI->hasFeature(RISCV::FeatureRelax))
- return false;
-
- // Calculate total Nops we need to insert. If there are none to insert
- // then simply return.
- unsigned Count;
- if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count) || (Count == 0))
- return false;
-
- MCContext &Ctx = getContext();
- const MCExpr *Dummy = MCConstantExpr::create(0, Ctx);
- MCFixup Fixup = MCFixup::create(0, Dummy, ELF::R_RISCV_ALIGN);
-
- uint64_t FixedValue = 0;
- MCValue NopBytes = MCValue::get(Count);
- Asm.getWriter().recordRelocation(AF, Fixup, NopBytes, FixedValue);
- return true;
-}
-
std::unique_ptr<MCObjectTargetWriter>
RISCVAsmBackend::createObjectTargetWriter() const {
return createRISCVELFObjectWriter(OSABI, Is64Bit);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 8c10fbe..d97d632 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -38,14 +38,6 @@ public:
const MCTargetOptions &Options);
~RISCVAsmBackend() override = default;
- // Return Size with extra Nop Bytes for alignment directive in code section.
- bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
- unsigned &Size) override;
-
- // Insert target specific fixup type for alignment directive in code section.
- bool shouldInsertFixupForCodeAlign(MCAssembler &Asm,
- MCAlignFragment &AF) override;
-
std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &,
uint64_t &) override;
bool addReloc(const MCFragment &, const MCFixup &, const MCValue &,
@@ -73,6 +65,7 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
+ bool relaxAlign(MCFragment &F, unsigned &Size) override;
bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
std::pair<bool, bool> relaxLEB128(MCFragment &LF,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index aeda5ac..5abb546 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -52,15 +52,6 @@ namespace RISCV {
#include "RISCVGenSearchableTables.inc"
} // namespace RISCV
-// Report an error but don't ask the user to report a bug.
-// TODO: Remove these wrappers.
-[[noreturn]] static void reportError(const char *Reason) {
- reportFatalUsageError(Reason);
-}
-[[noreturn]] static void reportError(Error Err) {
- reportFatalUsageError(std::move(Err));
-}
-
namespace RISCVABI {
ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
StringRef ABIName) {
@@ -97,7 +88,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
if ((TargetABI == RISCVABI::ABI::ABI_ILP32E ||
(TargetABI == ABI_Unknown && IsRVE && !IsRV64)) &&
FeatureBits[RISCV::FeatureStdExtD])
- reportError("ILP32E cannot be used with the D ISA extension");
+ reportFatalUsageError("ILP32E cannot be used with the D ISA extension");
if (TargetABI != ABI_Unknown)
return TargetABI;
@@ -105,7 +96,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
// If no explicit ABI is given, try to compute the default ABI.
auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits);
if (!ISAInfo)
- reportError(ISAInfo.takeError());
+ reportFatalUsageError(ISAInfo.takeError());
return getTargetABI((*ISAInfo)->computeDefaultABI());
}
@@ -137,12 +128,12 @@ namespace RISCVFeatures {
void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit])
- reportError("RV64 target requires an RV64 CPU");
+ reportFatalUsageError("RV64 target requires an RV64 CPU");
if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit])
- reportError("RV32 target requires an RV32 CPU");
+ reportFatalUsageError("RV32 target requires an RV32 CPU");
if (FeatureBits[RISCV::Feature32Bit] &&
FeatureBits[RISCV::Feature64Bit])
- reportError("RV32 and RV64 can't be combined");
+ reportFatalUsageError("RV32 and RV64 can't be combined");
}
llvm::Expected<std::unique_ptr<RISCVISAInfo>>
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index baa508a..269b117 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,13 +13,7 @@
#include "MCTargetDesc/RISCVAsmBackend.h"
#include "MCTargetDesc/RISCVMCAsmInfo.h"
-#include "RISCVFixupKinds.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index cbf039e..4c303a9 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -56,19 +56,21 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
(sequence "F%u_D", 0, 31))>;
+defvar VREGS = (add (sequence "V%u", 0, 31),
+ (sequence "V%uM2", 0, 31, 2),
+ (sequence "V%uM4", 0, 31, 4),
+ (sequence "V%uM8", 0, 31, 8));
+
// Same as CSR_Interrupt, but including all vector registers.
-def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but including all 32-bit FP registers and all vector
// registers.
-def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but including all 64-bit FP registers and all vector
// registers.
-def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but excluding X16-X31.
def CSR_Interrupt_RVE : CalleeSavedRegs<(sub CSR_Interrupt,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index f9c0b54..171940e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t
def FeatureVendorXSfvqmaccdod
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl128b]>;
def HasVendorXSfvqmaccdod
: Predicate<"Subtarget->hasVendorXSfvqmaccdod()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod),
@@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod
def FeatureVendorXSfvqmaccqoq
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl256b]>;
def HasVendorXSfvqmaccqoq
: Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq),
@@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq
def FeatureVendorXSfvfwmaccqqq
: RISCVExtension<1, 0,
"SiFive Matrix Multiply Accumulate Instruction (4-by-4)",
- [FeatureStdExtZvfbfmin]>;
+ [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>;
def HasVendorXSfvfwmaccqqq
: Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq),
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 23b4554..b1ab76a 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1544,10 +1544,53 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset;
}
+static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
+ const Register &Reg) {
+ MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
+  // If it's not a grouped vector register, it doesn't have a subregister, so
+  // the base register is the register itself.
+ if (BaseReg == RISCV::NoRegister)
+ BaseReg = Reg;
+ return BaseReg;
+}
+
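In effect, a grouped register reports its first VR member while an ungrouped register is its own base. A toy model of that mapping, using illustrative register names rather than real MCRegister IDs:

    #include <cassert>
    #include <string>

    // "v24m4" -> "v24" (grouped); "v7" -> "v7" (ungrouped).
    static std::string baseReg(const std::string &Reg) {
      auto M = Reg.find('m', 1);
      return M == std::string::npos ? Reg : Reg.substr(0, M);
    }

    int main() {
      assert(baseReg("v24m4") == "v24");
      assert(baseReg("v7") == "v7");
    }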
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  // In TargetFrameLowering::determineCalleeSaves, a vector register is marked
+  // as saved if any of its subregisters is clobbered. That is not correct for
+  // vector registers: we only want a vector register to be marked as saved if
+  // all of its subregisters are clobbered.
+ // For example:
+ // Original behavior: If v24 is marked, v24m2, v24m4, v24m8 are also marked.
+ // Correct behavior: v24m2 is marked only if v24 and v25 are marked.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ const RISCVRegisterInfo &TRI = *STI.getRegisterInfo();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned CSReg = CSRegs[i];
+ // Only vector registers need special care.
+ if (!RISCV::VRRegClass.contains(getRVVBaseRegister(TRI, CSReg)))
+ continue;
+
+ SavedRegs.reset(CSReg);
+
+ auto SubRegs = TRI.subregs(CSReg);
+ // Set the register and all its subregisters.
+ if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) {
+ SavedRegs.set(CSReg);
+    llvm::for_each(SubRegs, [&](unsigned Reg) { SavedRegs.set(Reg); });
+ }
+
+ // Combine to super register if all of its subregisters are marked.
+ if (!SubRegs.empty() && llvm::all_of(SubRegs, [&](unsigned Reg) {
+ return SavedRegs.test(Reg);
+ }))
+ SavedRegs.set(CSReg);
+ }
+
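A small sketch of the marking rule the loop implements, with std::bitset standing in for BitVector and assumed register numbers:

    #include <bitset>
    #include <vector>

    // A super register (e.g. v24m2) is saved only when every subregister
    // (e.g. v24 and v25) is saved.
    static void markSuperIfAllSubsSaved(unsigned Super,
                                        const std::vector<unsigned> &Subs,
                                        std::bitset<64> &Saved) {
      if (Subs.empty())
        return;
      for (unsigned R : Subs)
        if (!Saved.test(R))
          return;
      Saved.set(Super);
    }

    int main() {
      std::bitset<64> Saved;
      Saved.set(24); // only v24 clobbered
      markSuperIfAllSubsSaved(/*Super=*/40, {24, 25}, Saved);
      return Saved.test(40); // 1 would mean the super reg was (wrongly) marked
    }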
// Unconditionally spill RA and FP only if the function uses a frame
// pointer.
if (hasFP(MF)) {
@@ -2137,16 +2180,6 @@ static unsigned getCalleeSavedRVVNumRegs(const Register &BaseReg) {
: 8;
}
-static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
- const Register &Reg) {
- MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
- // If it's not a grouped vector register, it doesn't have subregister, so
- // the base register is just itself.
- if (BaseReg == RISCV::NoRegister)
- BaseReg = Reg;
- return BaseReg;
-}
-
void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const {
MachineFunction *MF = MBB.getParent();
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index cfec46d2..34910b7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3032,6 +3032,63 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
return true;
}
+/// Return true if this is a load/store for which we have a RegRegScale
+/// instruction.
+static bool isRegRegScaleLoadOrStore(SDNode *User, SDValue Add,
+ const RISCVSubtarget &Subtarget) {
+ if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE)
+ return false;
+ EVT VT = cast<MemSDNode>(User)->getMemoryVT();
+ if (!(VT.isScalarInteger() &&
+ (Subtarget.hasVendorXTHeadMemIdx() || Subtarget.hasVendorXqcisls())) &&
+ !((VT == MVT::f32 || VT == MVT::f64) &&
+ Subtarget.hasVendorXTHeadFMemIdx()))
+ return false;
+ // Don't allow stores of the value. It must be used as the address.
+ if (User->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(User)->getValue() == Add)
+ return false;
+
+ return true;
+}
+
+/// Return true if it is profitable to fold this Add into a RegRegScale
+/// load/store. If \p
+/// Shift is non-null, then we have matched a shl+add. We allow reassociating
+/// (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) if there is a
+/// single addi and we don't have a SHXADD instruction we could use.
+/// FIXME: May still need to check how many and what kind of users the SHL has.
+static bool isWorthFoldingIntoRegRegScale(const RISCVSubtarget &Subtarget,
+ SDValue Add,
+ SDValue Shift = SDValue()) {
+ bool FoundADDI = false;
+ for (auto *User : Add->users()) {
+ if (isRegRegScaleLoadOrStore(User, Add, Subtarget))
+ continue;
+
+ // Allow a single ADDI that is used by loads/stores if we matched a shift.
+ if (!Shift || FoundADDI || User->getOpcode() != ISD::ADD ||
+ !isa<ConstantSDNode>(User->getOperand(1)) ||
+ !isInt<12>(cast<ConstantSDNode>(User->getOperand(1))->getSExtValue()))
+ return false;
+
+ FoundADDI = true;
+
+ // If we have a SHXADD instruction, prefer that over reassociating an ADDI.
+ assert(Shift.getOpcode() == ISD::SHL);
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ if ((ShiftAmt <= 3 &&
+ (Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa())) ||
+ (ShiftAmt >= 4 && ShiftAmt <= 7 && Subtarget.hasVendorXqciac()))
+ return false;
+
+ // All users of the ADDI should be load/store.
+ for (auto *ADDIUser : User->users())
+ if (!isRegRegScaleLoadOrStore(ADDIUser, SDValue(User, 0), Subtarget))
+ return false;
+ }
+
+ return true;
+}
+
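The reassociation targets a single reg+reg-scaled access on subtargets without a usable SHXADD/th.addsl. A rough sketch of the intended selection, with assumed XTheadMemIdx syntax and register choices:

    # before: (add (add (shl a1, 2), a0), 40)  ; 40 fits in simm12
    addi   t0, a0, 40        # absorb the constant into the base once
    th.lrw a0, t0, a1, 2     # load from t0 + (a1 << 2)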
bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
unsigned MaxShiftAmount,
SDValue &Base, SDValue &Index,
@@ -3062,7 +3119,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
if (LHS.getOpcode() == ISD::ADD &&
!isa<ConstantSDNode>(LHS.getOperand(1)) &&
isInt<12>(C1->getSExtValue())) {
- if (SelectShl(LHS.getOperand(1), Index, Scale)) {
+ if (SelectShl(LHS.getOperand(1), Index, Scale) &&
+ isWorthFoldingIntoRegRegScale(*Subtarget, LHS, LHS.getOperand(1))) {
SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
SDLoc(Addr), VT);
Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
@@ -3072,7 +3130,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
}
// Add is commutative so we need to check both operands.
- if (SelectShl(LHS.getOperand(0), Index, Scale)) {
+ if (SelectShl(LHS.getOperand(0), Index, Scale) &&
+ isWorthFoldingIntoRegRegScale(*Subtarget, LHS, LHS.getOperand(0))) {
SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
SDLoc(Addr), VT);
Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
@@ -3090,22 +3149,48 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
// Try to match a shift on the RHS.
if (SelectShl(RHS, Index, Scale)) {
+ if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr, RHS))
+ return false;
Base = LHS;
return true;
}
// Try to match a shift on the LHS.
if (SelectShl(LHS, Index, Scale)) {
+ if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr, LHS))
+ return false;
Base = RHS;
return true;
}
+ if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr))
+ return false;
+
Base = LHS;
Index = RHS;
Scale = CurDAG->getTargetConstant(0, SDLoc(Addr), VT);
return true;
}
+bool RISCVDAGToDAGISel::SelectAddrRegZextRegScale(SDValue Addr,
+ unsigned MaxShiftAmount,
+ unsigned Bits, SDValue &Base,
+ SDValue &Index,
+ SDValue &Scale) {
+ if (!SelectAddrRegRegScale(Addr, MaxShiftAmount, Base, Index, Scale))
+ return false;
+
+ if (Index.getOpcode() == ISD::AND) {
+ auto *C = dyn_cast<ConstantSDNode>(Index.getOperand(1));
+ if (C && C->getZExtValue() == maskTrailingOnes<uint64_t>(Bits)) {
+ Index = Index.getOperand(0);
+ return true;
+ }
+ }
+
+ return false;
+}
+
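The AND check accepts exactly a zero-extension mask: with Bits == 32, maskTrailingOnes<uint64_t>(32) is 0xffffffff, so (and X, 0xffffffff) selects X itself as the index. A standalone sketch of the predicate:

    #include <cstdint>

    // The same trailing-ones test the selector applies to the AND mask.
    static bool isZextMask(uint64_t C, unsigned Bits) {
      uint64_t Mask = Bits >= 64 ? ~0ull : (1ull << Bits) - 1;
      return C == Mask; // e.g. Bits == 32 accepts only 0xffffffff
    }

    int main() { return isZextMask(0xffffffffull, 32) ? 0 : 1; }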
bool RISCVDAGToDAGISel::SelectAddrRegReg(SDValue Addr, SDValue &Base,
SDValue &Offset) {
if (Addr.getOpcode() != ISD::ADD)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 72e2f96..ee3a86e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -59,19 +59,14 @@ public:
return SelectAddrRegRegScale(Addr, MaxShift, Base, Index, Scale);
}
+ bool SelectAddrRegZextRegScale(SDValue Addr, unsigned MaxShiftAmount,
+ unsigned Bits, SDValue &Base, SDValue &Index,
+ SDValue &Scale);
+
template <unsigned MaxShift, unsigned Bits>
bool SelectAddrRegZextRegScale(SDValue Addr, SDValue &Base, SDValue &Index,
SDValue &Scale) {
- if (SelectAddrRegRegScale(Addr, MaxShift, Base, Index, Scale)) {
- if (Index.getOpcode() == ISD::AND) {
- auto *C = dyn_cast<ConstantSDNode>(Index.getOperand(1));
- if (C && C->getZExtValue() == maskTrailingOnes<uint64_t>(Bits)) {
- Index = Index.getOperand(0);
- return true;
- }
- }
- }
- return false;
+ return SelectAddrRegZextRegScale(Addr, MaxShift, Bits, Base, Index, Scale);
}
bool SelectAddrRegReg(SDValue Addr, SDValue &Base, SDValue &Offset);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4845a9c..54845e5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1618,6 +1618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}
+  // Customize load and store operations for bf16 when Zfh isn't enabled.
+ if (Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh()) {
+ setOperationAction(ISD::LOAD, MVT::bf16, Custom);
+ setOperationAction(ISD::STORE, MVT::bf16, Custom);
+ }
+
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtZca() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
@@ -2319,6 +2325,10 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
if (getLegalZfaFPImm(Imm, VT) >= 0)
return true;
+ // Some constants can be produced by fli+fneg.
+ if (Imm.isNegative() && getLegalZfaFPImm(-Imm, VT) >= 0)
+ return true;
+
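For example, 2.5 is in the Zfa fli constant table but -2.5 is not, so -2.5 is still treated as legal and can be materialized as (sketch, assumed assembly):

    fli.d   fa0, 2.5
    fneg.d  fa0, fa0      # -2.5 via fli + fneg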
// Cannot create a 64 bit floating-point immediate value for rv32.
if (Subtarget.getXLen() < VT.getScalarSizeInBits()) {
// td can handle +0.0 or -0.0 already.
@@ -7212,6 +7222,47 @@ static SDValue SplitStrictFPVectorOp(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({V, HiRes.getValue(1)}, DL);
}
+SDValue
+RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Load(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() &&
+ "Unexpected bfloat16 load lowering");
+
+ SDLoc DL(Op);
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+ EVT MemVT = LD->getMemoryVT();
+ SDValue Load = DAG.getExtLoad(
+ ISD::ZEXTLOAD, DL, Subtarget.getXLenVT(), LD->getChain(),
+ LD->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ LD->getMemOperand());
+  // OR in a mask to make the bf16 value properly nan-boxed when we don't have
+  // the flh instruction. -65536 is treated as a small negative number, so the
+  // constant can be materialized directly with a single lui.
+  SDValue Mask = DAG.getSignedConstant(-65536, DL, Subtarget.getXLenVT());
+  SDValue NanBoxed =
+      DAG.getNode(ISD::OR, DL, Load.getValueType(), {Load, Mask});
+  SDValue ConvertedResult =
+      DAG.getNode(RISCVISD::NDS_FMV_BF16_X, DL, MVT::bf16, NanBoxed);
+ return DAG.getMergeValues({ConvertedResult, Load.getValue(1)}, DL);
+}
+
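The OR with -65536 performs the nan-boxing: the 16 payload bits are kept and every higher bit is forced to one. A self-contained sketch of the same transform:

    #include <cstdint>
    #include <cstdio>

    // Nan-box a raw bf16 bit pattern into a 32-bit value, as the DAG above does.
    static uint32_t nanBoxBF16(uint16_t Bits) {
      return uint32_t(Bits) | 0xFFFF0000u; // the -65536 mask
    }

    int main() { std::printf("%08x\n", nanBoxBF16(0x3F80)); } // prints ffff3f80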
+SDValue
+RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Store(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() &&
+ "Unexpected bfloat16 store lowering");
+
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ SDValue FMV = DAG.getNode(RISCVISD::NDS_FMV_X_ANYEXTBF16, DL,
+ Subtarget.getXLenVT(), ST->getValue());
+ return DAG.getTruncStore(
+ ST->getChain(), DL, FMV, ST->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), ST->getMemoryVT().getSizeInBits()),
+ ST->getMemOperand());
+}
+
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -7910,6 +7961,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return DAG.getMergeValues({Pair, Chain}, DL);
}
+ if (VT == MVT::bf16)
+ return lowerXAndesBfHCvtBFloat16Load(Op, DAG);
+
// Handle normal vector tuple load.
if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
@@ -7936,7 +7990,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
BasePtr, MachinePointerInfo(Load->getAddressSpace()), Align(8));
OutChains.push_back(LoadVal.getValue(1));
Ret = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Ret, LoadVal,
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
BasePtr = DAG.getNode(ISD::ADD, DL, XLenVT, BasePtr, VROffset, Flag);
}
return DAG.getMergeValues(
@@ -7994,6 +8048,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
{Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64,
Store->getMemOperand());
}
+
+ if (VT == MVT::bf16)
+ return lowerXAndesBfHCvtBFloat16Store(Op, DAG);
+
// Handle normal vector tuple store.
if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
@@ -8015,9 +8073,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// Extract subregisters in a vector tuple and store them individually.
for (unsigned i = 0; i < NF; ++i) {
- auto Extract = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL,
- MVT::getScalableVectorVT(MVT::i8, NumElts),
- StoredVal, DAG.getVectorIdxConstant(i, DL));
+ auto Extract =
+ DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL,
+ MVT::getScalableVectorVT(MVT::i8, NumElts), StoredVal,
+ DAG.getTargetConstant(i, DL, MVT::i32));
Ret = DAG.getStore(Chain, DL, Extract, BasePtr,
MachinePointerInfo(Store->getAddressSpace()),
Store->getBaseAlign(),
@@ -10934,9 +10993,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Load->getMemoryVT(), Load->getMemOperand());
SmallVector<SDValue, 9> Results;
for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) {
- SDValue SubVec =
- DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT,
- Result.getValue(0), DAG.getVectorIdxConstant(RetIdx, DL));
+ SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT,
+ Result.getValue(0),
+ DAG.getTargetConstant(RetIdx, DL, MVT::i32));
Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget));
}
Results.push_back(Result.getValue(1));
@@ -11023,7 +11082,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
convertToScalableVector(
ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget),
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
SDValue Ops[] = {
FixedIntrinsic->getChain(),
@@ -12027,7 +12086,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
for (unsigned i = 0U; i < Factor; ++i)
Res[i] = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, VecVT, Load,
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
return DAG.getMergeValues(Res, DL);
}
@@ -12124,8 +12183,9 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
SDValue StoredVal = DAG.getUNDEF(VecTupTy);
for (unsigned i = 0; i < Factor; i++)
- StoredVal = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
- Op.getOperand(i), DAG.getConstant(i, DL, XLenVT));
+ StoredVal =
+ DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
+ Op.getOperand(i), DAG.getTargetConstant(i, DL, MVT::i32));
SDValue Ops[] = {DAG.getEntryNode(),
DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT),
@@ -16073,7 +16133,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt = CNode->getZExtValue();
// Don't do this if the Xqciac extension is enabled and the MulAmt in simm12.
- if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt))
+ if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue()))
return SDValue();
const bool HasShlAdd = Subtarget.hasStdExtZba() ||
@@ -16178,10 +16238,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
for (uint64_t Offset : {3, 5, 9}) {
if (isPowerOf2_64(MulAmt + Offset)) {
+ unsigned ShAmt = Log2_64(MulAmt + Offset);
+ if (ShAmt >= VT.getSizeInBits())
+ continue;
SDLoc DL(N);
SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT));
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Offset - 1), DL, VT), X);
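A worked instance of the 2^N - 3/5/9 rule: MulAmt = 29 gives 29 + 3 == 32 == 1 << 5, so ShAmt is 5 and the SHL_ADD computes 3*X; the new guard only skips the degenerate case where MulAmt + Offset wraps past the value type's width. Sketch with Zba syntax:

    sh1add  t0, a0, a0    # t0 = (a0 << 1) + a0 = 3*a0
    slli    t1, a0, 5     # t1 = 32*a0
    sub     a0, t1, t0    # a0 = 29*a0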
@@ -20690,7 +20752,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Result = DAG.getUNDEF(VT);
for (unsigned i = 0; i < NF; ++i)
Result = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Result, Splat,
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
return Result;
}
// If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer
@@ -24014,7 +24076,7 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts(
#endif
Val = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, PartVT, DAG.getUNDEF(PartVT),
- Val, DAG.getVectorIdxConstant(0, DL));
+ Val, DAG.getTargetConstant(0, DL, MVT::i32));
Parts[0] = Val;
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index e0a8c07..ca70c46 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -434,7 +434,8 @@ public:
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@ public:
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleaveValues) const override;
- bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
- ArrayRef<Value *> InterleaveOps) const override;
-
bool supportKCFIBundles() const override { return true; }
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
@@ -580,6 +578,9 @@ private:
SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXAndesBfHCvtBFloat16Load(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXAndesBfHCvtBFloat16Store(SDValue Op, SelectionDAG &DAG) const;
+
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index e23001a..d9c6101 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -174,6 +174,7 @@ class EltDeps<bit vl, bit mask> {
def EltDepsNone : EltDeps<vl=0, mask=0>;
def EltDepsVL : EltDeps<vl=1, mask=0>;
+def EltDepsMask : EltDeps<vl=0, mask=1>;
def EltDepsVLMask : EltDeps<vl=1, mask=1>;
class EEW <bits<2> val> {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index aef410f..dd365cf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -44,67 +44,86 @@ def simm10_unsigned : RISCVOp {
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryImm10<bits<7> funct7, string opcodestr,
- DAGOperand TyImm10 = simm10>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins TyImm10:$imm10),
- opcodestr, "$rd, $imm10"> {
+class PLI_i<bits<7> funct7, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [],
+ InstFormatOther> {
bits<10> imm10;
+ bits<5> rd;
let Inst{31-25} = funct7;
let Inst{24-16} = imm10{8-0};
let Inst{15} = imm10{9};
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryImm8<bits<8> funct8, string opcodestr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins uimm8:$uimm8),
- opcodestr, "$rd, $uimm8"> {
+class PLUI_i<bits<7> funct7, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr,
+ "$rd, $imm10", [], InstFormatOther> {
+ bits<10> imm10;
+ bits<5> rd;
+
+ let Inst{31-25} = funct7;
+ let Inst{24} = imm10{0};
+ let Inst{23-15} = imm10{9-1};
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class PLI_B_i<bits<8> funct8, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [],
+ InstFormatOther> {
bits<8> uimm8;
+ bits<5> rd;
let Inst{31-24} = funct8;
let Inst{23-16} = uimm8;
let Inst{15} = 0b0;
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> {
- bits<5> imm;
- bits<5> rs1;
-
+class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
+ : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+ (ins GPR:$rs1, ImmType:$shamt), opcodestr,
+ "$rd, $rs1, $shamt"> {
let Inst{31} = 0b1;
let Inst{30-28} = f;
let Inst{27} = 0b0;
- let Inst{19-15} = rs1;
}
-class RVPUnaryImm5<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> {
- bits<5> uimm5;
+class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm5> {
+ bits<5> shamt;
- let imm = uimm5;
let Inst{26-25} = 0b01;
- let Inst{24-20} = uimm5;
+ let Inst{24-20} = shamt;
}
-class RVPUnaryImm4<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm4:$uimm4), "$rd, $rs1, $uimm4"> {
- bits<4> uimm4;
+class RVPShiftH_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm4> {
+ bits<4> shamt;
let Inst{26-24} = 0b001;
- let Inst{23-20} = uimm4;
+ let Inst{23-20} = shamt;
}
-class RVPUnaryImm3<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm3:$uimm3), "$rd, $rs1, $uimm3"> {
- bits<3> uimm3;
+class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm3> {
+ bits<3> shamt;
let Inst{26-23} = 0b0001;
- let Inst{22-20} = uimm3;
+ let Inst{22-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryWUF<bits<2> w, bits<5> uf, string opcodestr>
+class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
: RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
let Inst{31-27} = 0b11100;
@@ -132,36 +151,36 @@ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
-def PSLLI_B : RVPUnaryImm3<0b000, "pslli.b">;
-def PSLLI_H : RVPUnaryImm4<0b000, "pslli.h">;
-def PSSLAI_H : RVPUnaryImm4<0b101, "psslai.h">;
+def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
} // Predicates = [HasStdExtP]
let DecoderNamespace = "RV32Only",
Predicates = [HasStdExtP, IsRV32] in
-def SSLAI : RVPUnaryImm5<0b101, "sslai">;
+def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W : RVPUnaryImm5<0b000, "pslli.w">;
-def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">;
+def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
-def PLI_H : RVPUnaryImm10<0b1011000, "pli.h">;
+def PLI_H : PLI_i<0b1011000, "pli.h">;
let Predicates = [HasStdExtP, IsRV64] in
-def PLI_W : RVPUnaryImm10<0b1011001, "pli.w">;
+def PLI_W : PLI_i<0b1011001, "pli.w">;
let Predicates = [HasStdExtP] in
-def PLI_B : RVPUnaryImm8<0b10110100, "pli.b">;
+def PLI_B : PLI_B_i<0b10110100, "pli.b">;
let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">;
-def PSABS_H : RVPUnaryWUF<0b00, 0b00111, "psabs.h">;
-def PSABS_B : RVPUnaryWUF<0b10, 0b00111, "psabs.b">;
+def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
} // Predicates = [HasStdExtP]
let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnaryWUF<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">;
+def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
-def PLUI_H : RVPUnaryImm10<0b1111000, "plui.h", simm10_unsigned>;
+def PLUI_H : PLUI_i<0b1111000, "plui.h">;
let Predicates = [HasStdExtP, IsRV64] in
-def PLUI_W : RVPUnaryImm10<0b1111001, "plui.w", simm10_unsigned>;
+def PLUI_W : PLUI_i<0b1111001, "plui.w">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 5d13a87..33c7138 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1642,7 +1642,7 @@ def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd),
def : MnemonicAlias<"vpopc.m", "vcpop.m">;
-let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask in {
+let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsMask in {
let DestEEW = EEW1 in {
// vmsbf.m set-before-first mask bit
@@ -1655,7 +1655,7 @@ defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>;
// Vector Iota Instruction
defm VIOTA_M : VIOTA_MV_V<"viota.m", 0b010100, 0b10000>;
-} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsMask
// Vector Element Index Instruction
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index de9e55b..6afc942d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -543,7 +543,8 @@ defset list<VTypeInfoToWide> AllWidenableBFloatToFloatVectors = {
// This represents the information we need in codegen for each pseudo.
// The definition should be consistent with `struct PseudoInfo` in
// RISCVInstrInfo.h.
-class RISCVVPseudo {
+class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [], string opcodestr = "", string argstr = "">
+ : Pseudo<outs, ins, pattern, opcodestr, argstr> {
Pseudo Pseudo = !cast<Pseudo>(NAME); // Used as a key.
Instruction BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
// SEW = 0 is used to denote that the Pseudo is not SEW specific (or unknown).
@@ -785,10 +786,9 @@ class GetVTypeMinimalPredicates<VTypeInfo vti> {
class VPseudoUSLoadNoMask<VReg RetClass,
int EEW,
DAGOperand sewop = sew> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, sewop:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sewop:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -801,11 +801,10 @@ class VPseudoUSLoadNoMask<VReg RetClass,
class VPseudoUSLoadMask<VReg RetClass,
int EEW> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
+ vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -820,10 +819,9 @@ class VPseudoUSLoadMask<VReg RetClass,
class VPseudoUSLoadFFNoMask<VReg RetClass,
int EEW> :
- Pseudo<(outs RetClass:$rd, GPR:$vl),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -836,11 +834,10 @@ class VPseudoUSLoadFFNoMask<VReg RetClass,
class VPseudoUSLoadFFMask<VReg RetClass,
int EEW> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$avl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
+ vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -855,10 +852,9 @@ class VPseudoUSLoadFFMask<VReg RetClass,
class VPseudoSLoadNoMask<VReg RetClass,
int EEW> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -871,11 +867,10 @@ class VPseudoSLoadNoMask<VReg RetClass,
class VPseudoSLoadMask<VReg RetClass,
int EEW> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -895,10 +890,9 @@ class VPseudoILoadNoMask<VReg RetClass,
bit Ordered,
bit EarlyClobber,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -917,11 +911,10 @@ class VPseudoILoadMask<VReg RetClass,
bit Ordered,
bit EarlyClobber,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -938,9 +931,9 @@ class VPseudoILoadMask<VReg RetClass,
class VPseudoUSStoreNoMask<VReg StClass,
int EEW,
DAGOperand sewop = sew> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sewop:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sewop:$sew)>,
RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -951,10 +944,9 @@ class VPseudoUSStoreNoMask<VReg StClass,
class VPseudoUSStoreMask<VReg StClass,
int EEW> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -966,10 +958,9 @@ class VPseudoUSStoreMask<VReg StClass,
class VPseudoSStoreNoMask<VReg StClass,
int EEW> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
+ AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -980,10 +971,9 @@ class VPseudoSStoreNoMask<VReg StClass,
class VPseudoSStoreMask<VReg StClass,
int EEW> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -994,10 +984,9 @@ class VPseudoSStoreMask<VReg StClass,
}
class VPseudoNullaryNoMask<VReg RegClass> :
- Pseudo<(outs RegClass:$rd),
- (ins RegClass:$passthru,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RegClass:$rd),
+ (ins RegClass:$passthru,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1008,10 +997,10 @@ class VPseudoNullaryNoMask<VReg RegClass> :
}
class VPseudoNullaryMask<VReg RegClass> :
- Pseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
- (ins GetVRegNoV0<RegClass>.R:$passthru,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
+ (ins GetVRegNoV0<RegClass>.R:$passthru,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1026,8 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> :
// Nullary for pseudo instructions. They are expanded in
// RISCVExpandPseudoInsts pass.
class VPseudoNullaryPseudoM<string BaseInst> :
- Pseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1041,10 +1029,9 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, OpClass:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, OpClass:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1059,9 +1046,8 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1075,10 +1061,9 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1097,10 +1082,9 @@ class VPseudoUnaryMask<VReg RetClass,
string Constraint = "",
bits<2> TargetConstraintType = 1,
DAGOperand sewop = sew> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
+ VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1117,11 +1101,10 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
VReg OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
- VMaskOp:$vm, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
+ VMaskOp:$vm, vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1155,9 +1138,8 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
}
class VPseudoUnaryNoMaskGPROut :
- Pseudo<(outs GPR:$rd),
- (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GPR:$rd),
+ (ins VR:$rs2, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1166,9 +1148,8 @@ class VPseudoUnaryNoMaskGPROut :
}
class VPseudoUnaryMaskGPROut :
- Pseudo<(outs GPR:$rd),
- (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GPR:$rd),
+ (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1180,10 +1161,9 @@ class VPseudoUnaryMaskGPROut :
// Mask can be V0~V31
class VPseudoUnaryAnyMask<VReg RetClass,
VReg Op1Class> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2,
- VR:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2,
+ VR:$vm, AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1198,9 +1178,9 @@ class VPseudoBinaryNoMask<VReg RetClass,
string Constraint,
bits<2> TargetConstraintType = 1,
DAGOperand sewop = sew> :
- Pseudo<(outs RetClass:$rd),
- (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1215,10 +1195,9 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1235,10 +1214,10 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass,
string Constraint,
bit UsesVXRM_ = 1,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
+ vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1258,12 +1237,11 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass,
string Constraint,
bit UsesVXRM_,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, vec_rm:$rm, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, vec_rm:$rm, AVL:$vl,
+ sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1286,10 +1264,9 @@ class VPseudoTiedBinaryNoMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew,
+ vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1307,12 +1284,11 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs2, Op2Class:$rs1,
- vec_rm:$rm,
- AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs2, Op2Class:$rs1,
+ vec_rm:$rm,
+ AVL:$vl, sew:$sew,
+ vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1331,10 +1307,9 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass,
class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered>:
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, AVL:$vl,
- sew:$sew),[]>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
+                           AVL:$vl, sew:$sew), []>,
RISCVVSX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1345,10 +1320,9 @@ class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered>:
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew),[]>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
+                            VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
RISCVVSX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1363,11 +1337,11 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1383,11 +1357,11 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
class VPseudoTernaryMaskPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1401,13 +1375,12 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm,
- vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm,
+ vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1427,11 +1400,11 @@ class VPseudoBinaryMOutMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1451,11 +1424,11 @@ class VPseudoTiedBinaryMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1473,13 +1446,12 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op2Class:$rs1,
- VMaskOp:$vm,
- vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op2Class:$rs1,
+ VMaskOp:$vm,
+ vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1503,13 +1475,12 @@ class VPseudoBinaryCarry<VReg RetClass,
bit CarryIn,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- !if(CarryIn,
- (ins Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, sew:$sew),
- (ins Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew)), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ !if(CarryIn,
+ (ins Op1Class:$rs2, Op2Class:$rs1,
+ VMV0:$carry, AVL:$vl, sew:$sew),
+ (ins Op1Class:$rs2, Op2Class:$rs1,
+ AVL:$vl, sew:$sew))> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1525,10 +1496,9 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
DAGOperand Op2Class,
LMULInfo MInfo,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
+ VMV0:$carry, AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1544,10 +1514,9 @@ class VPseudoTernaryNoMask<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
string Constraint> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1561,10 +1530,9 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
DAGOperand Op2Class,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1580,10 +1548,10 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass,
DAGOperand Op2Class,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1600,10 +1568,9 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass,
class VPseudoUSSegLoadNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1617,10 +1584,10 @@ class VPseudoUSSegLoadNoMask<VReg RetClass,
class VPseudoUSSegLoadMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
+ vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1636,10 +1603,9 @@ class VPseudoUSSegLoadMask<VReg RetClass,
class VPseudoUSSegLoadFFNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs RetClass:$rd, GPR:$vl),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1653,10 +1619,10 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass,
class VPseudoUSSegLoadFFMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$avl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
+ vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1672,10 +1638,9 @@ class VPseudoUSSegLoadFFMask<VReg RetClass,
class VPseudoSSegLoadNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1689,11 +1654,10 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
class VPseudoSSegLoadMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- GPR:$offset, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, GPR:$offset, VMaskOp:$vm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1712,10 +1676,10 @@ class VPseudoISegLoadNoMask<VReg RetClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, GPRMemZeroOffset:$rs1,
+ IdxClass:$offset, AVL:$vl, sew:$sew,
+ vec_policy:$policy)>,
RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1734,11 +1698,10 @@ class VPseudoISegLoadMask<VReg RetClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- IdxClass:$offset, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1756,9 +1719,9 @@ class VPseudoISegLoadMask<VReg RetClass,
class VPseudoUSSegStoreNoMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew),
+ []>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1770,10 +1733,9 @@ class VPseudoUSSegStoreNoMask<VReg ValClass,
class VPseudoUSSegStoreMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1786,10 +1748,9 @@ class VPseudoUSSegStoreMask<VReg ValClass,
class VPseudoSSegStoreNoMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
+ AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1801,10 +1762,9 @@ class VPseudoSSegStoreNoMask<VReg ValClass,
class VPseudoSSegStoreMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1820,10 +1780,9 @@ class VPseudoISegStoreNoMask<VReg ValClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$index,
+ AVL:$vl, sew:$sew)>,
RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1838,10 +1797,9 @@ class VPseudoISegStoreMask<VReg ValClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$index,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -6745,16 +6703,14 @@ let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
def PseudoVMV_X_S:
- Pseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>,
- Sched<[WriteVMovXS, ReadVMovXS]>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>,
+ Sched<[WriteVMovXS, ReadVMovXS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
- def PseudoVMV_S_X: Pseudo<(outs VR:$rd),
+ def PseudoVMV_S_X: RISCVVPseudo<(outs VR:$rd),
(ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew),
[]>,
- Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>,
- RISCVVPseudo;
+ Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>;
}
} // Predicates = [HasVInstructions]
@@ -6767,18 +6723,15 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
def "PseudoVFMV_" # f.FX # "_S" :
- Pseudo<(outs f.fprclass:$rd),
- (ins VR:$rs2, sew:$sew), []>,
- Sched<[WriteVMovFS, ReadVMovFS]>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>,
+ Sched<[WriteVMovFS, ReadVMovFS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
def "PseudoVFMV_S_" # f.FX :
- Pseudo<(outs VR:$rd),
+ RISCVVPseudo<(outs VR:$rd),
(ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew),
[]>,
- Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
- RISCVVPseudo;
+ Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
}
}
} // Predicates = [HasVInstructionsAnyF]
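Editorial aside on the refactor above: every hunk in RISCVInstrInfoVPseudos.td makes the same mechanical change, folding the Pseudo<(outs ...), (ins ...), [...]> arguments into the RISCVVPseudo parent class. Call sites that drop the trailing [] suggest the pattern list is now defaulted. A rough C++ analogy of the shape of the change follows; this is a sketch, not the actual TableGen class definitions:

#include <string>
#include <vector>

struct Pseudo {
  Pseudo(std::string Outs, std::string Ins, std::vector<std::string> Pattern)
      : Outs(std::move(Outs)), Ins(std::move(Ins)),
        Pattern(std::move(Pattern)) {}
  std::string Outs, Ins;
  std::vector<std::string> Pattern;
};

// RISCVVPseudo now takes the operand lists itself and forwards them to
// Pseudo, with the pattern defaulting to empty -- which is why many of the
// rewritten call sites can drop the trailing "[]".
struct RISCVVPseudo : Pseudo {
  RISCVVPseudo(std::string Outs, std::string Ins,
               std::vector<std::string> Pattern = {})
      : Pseudo(std::move(Outs), std::move(Ins), std::move(Pattern)) {}
};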
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 5220815..c75addd9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -11,6 +11,20 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_NDS_FMV_BF16_X
+ : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, XLenVT>]>;
+def SDT_NDS_FMV_X_ANYEXTBF16
+ : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, bf16>]>;
+
+def riscv_nds_fmv_bf16_x
+ : SDNode<"RISCVISD::NDS_FMV_BF16_X", SDT_NDS_FMV_BF16_X>;
+def riscv_nds_fmv_x_anyextbf16
+ : SDNode<"RISCVISD::NDS_FMV_X_ANYEXTBF16", SDT_NDS_FMV_X_ANYEXTBF16>;
+
+//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -448,11 +462,10 @@ class NDSRVInstVLN<bits<5> funct5, string opcodestr>
}
class VPseudoVLN8NoMask<VReg RetClass, bit U> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest,
- GPRMemZeroOffset:$rs1,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest,
+ GPRMemZeroOffset:$rs1,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVNDSVLN</*Masked*/0, /*Unsigned*/U, !logtwo(8), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -464,11 +477,11 @@ class VPseudoVLN8NoMask<VReg RetClass, bit U> :
}
class VPseudoVLN8Mask<VReg RetClass, bit U> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []>,
RISCVNDSVLN</*Masked*/1, /*Unsigned*/U, !logtwo(8), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -774,6 +787,25 @@ def : Pat<(bf16 (fpround FPR32:$rs)),
(NDS_FCVT_BF16_S FPR32:$rs)>;
} // Predicates = [HasVendorXAndesBFHCvt]
+let isCodeGenOnly = 1 in {
+def NDS_FMV_BF16_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR16, GPR, "fmv.w.x">,
+ Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>;
+def NDS_FMV_X_BF16 : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR16, "fmv.x.w">,
+ Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>;
+}
+
+let Predicates = [HasVendorXAndesBFHCvt] in {
+def : Pat<(riscv_nds_fmv_bf16_x GPR:$src), (NDS_FMV_BF16_X GPR:$src)>;
+def : Pat<(riscv_nds_fmv_x_anyextbf16 (bf16 FPR16:$src)),
+ (NDS_FMV_X_BF16 (bf16 FPR16:$src))>;
+} // Predicates = [HasVendorXAndesBFHCvt]
+
+// Use flh/fsh to load/store bf16 if Zfh is enabled.
+let Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] in {
+def : LdPat<load, FLH, bf16>;
+def : StPat<store, FSH, FPR16, bf16>;
+} // Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt]
+
let Predicates = [HasVendorXAndesVBFHCvt] in {
defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16;
defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index 3912eb0..ebcf079 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -154,18 +154,17 @@ foreach m = MxList in {
let VLMul = m.value in {
let BaseInstr = RI_VEXTRACT in
def PseudoRI_VEXTRACT_ # mx :
- Pseudo<(outs GPR:$rd), (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew),
- []>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs GPR:$rd),
+ (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew),
+ []>;
let HasVLOp = 1, BaseInstr = RI_VINSERT, HasVecPolicyOp = 1,
Constraints = "$rd = $rs1" in
def PseudoRI_VINSERT_ # mx :
- Pseudo<(outs m.vrclass:$rd),
- (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl,
- ixlenimm:$sew, ixlenimm:$policy),
- []>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs m.vrclass:$rd),
+ (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl,
+ ixlenimm:$sew, ixlenimm:$policy),
+ []>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 17fb75e..a47dfe3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -243,10 +243,9 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], DecoderNamespace = "XSfvector",
}
class VPseudoVC_X<Operand OpClass, DAGOperand RS1Class> :
- Pseudo<(outs),
- (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs),
+ (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -255,10 +254,9 @@ class VPseudoVC_X<Operand OpClass, DAGOperand RS1Class> :
}
class VPseudoVC_XV<Operand OpClass, VReg RS2Class, DAGOperand RS1Class> :
- Pseudo<(outs),
- (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs),
+ (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -268,10 +266,9 @@ class VPseudoVC_XV<Operand OpClass, VReg RS2Class, DAGOperand RS1Class> :
class VPseudoVC_XVV<Operand OpClass, VReg RDClass, VReg RS2Class,
DAGOperand RS1Class> :
- Pseudo<(outs),
- (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs),
+ (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -280,10 +277,9 @@ class VPseudoVC_XVV<Operand OpClass, VReg RDClass, VReg RS2Class,
}
class VPseudoVC_V_X<Operand OpClass, VReg RDClass, DAGOperand RS1Class> :
- Pseudo<(outs RDClass:$rd),
- (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RDClass:$rd),
+ (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -293,10 +289,9 @@ class VPseudoVC_V_X<Operand OpClass, VReg RDClass, DAGOperand RS1Class> :
class VPseudoVC_V_XV<Operand OpClass, VReg RDClass, VReg RS2Class,
DAGOperand RS1Class> :
- Pseudo<(outs RDClass:$rd),
- (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RDClass:$rd),
+ (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -306,10 +301,9 @@ class VPseudoVC_V_XV<Operand OpClass, VReg RDClass, VReg RS2Class,
class VPseudoVC_V_XVV<Operand OpClass, VReg RDClass, VReg RS2Class,
DAGOperand RS1Class> :
- Pseudo<(outs RDClass:$rd),
- (ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RDClass:$rd),
+ (ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index c7cb6e2..f391300 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1377,9 +1377,9 @@ let Predicates = [HasVendorXqciac, IsRV32] in {
def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))),
(QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>;
def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)),
- (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+ (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)),
- (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+ (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
} // Predicates = [HasVendorXqciac, IsRV32]
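The two pattern fixes above swap the QC_SHLADD source operands so that the shifted register comes first and the addend second. A minimal scalar sketch of the semantics the corrected patterns select, assuming rd = (rs1 << imm) + rs2 and reading uimm5gt3 as a 5-bit unsigned immediate greater than 3:

#include <cassert>
#include <cstdint>

// Hypothetical scalar model of QC_SHLADD with the corrected operand order:
// rs1 is the shifted source, rs2 is the addend.
uint32_t qc_shladd(uint32_t rs1, uint32_t rs2, unsigned imm) {
  assert(imm > 3 && imm < 32 && "uimm5gt3: 5-bit immediate greater than 3");
  return (rs1 << imm) + rs2;
}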
/// Simple arithmetic operations
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index f173440..ed1a60a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -291,31 +291,31 @@ def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1),
- (C_SEXT_B GPRC:$rs1, GPRC:$rs1)>;
+ (C_SEXT_B GPRC:$rs1)>;
def : CompressPat<(SEXT_H GPRC:$rs1, GPRC:$rs1),
- (C_SEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_SEXT_H GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZbb]
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(ZEXT_H_RV32 GPRC:$rs1, GPRC:$rs1),
- (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_H GPRC:$rs1)>;
def : CompressPat<(ZEXT_H_RV64 GPRC:$rs1, GPRC:$rs1),
- (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_H GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZbb]
let Predicates = [HasStdExtZcb] in{
def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, 255),
- (C_ZEXT_B GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_B GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb]
let Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] in{
def : CompressPat<(ADD_UW GPRC:$rs1, GPRC:$rs1, X0),
- (C_ZEXT_W GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_W GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64]
let Predicates = [HasStdExtZcb] in{
def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
- (C_NOT GPRC:$rs1, GPRC:$rs1)>;
+ (C_NOT GPRC:$rs1)>;
}
let Predicates = [HasStdExtZcb] in{
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 4147c97..a250ac8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -230,9 +230,8 @@ class ZvkMxSet<string vd_lmul> {
}
class VPseudoBinaryNoMask_Zvk<DAGOperand RetClass, VReg OpClass> :
- Pseudo<(outs RetClass:$rd_wb),
- (ins RetClass:$rd, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd_wb),
+ (ins RetClass:$rd, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -246,10 +245,9 @@ class VPseudoBinaryNoMask_Zvk<DAGOperand RetClass, VReg OpClass> :
class VPseudoTernaryNoMask_Zvk<VReg RetClass,
VReg Op1Class,
DAGOperand Op2Class> :
- Pseudo<(outs RetClass:$rd_wb),
+ RISCVVPseudo<(outs RetClass:$rd_wb),
(ins RetClass:$rd, Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index dd68a55..30d8f85 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -131,25 +131,56 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
: Constant::getAllOnesValue(XLenTy);
return true;
}
- auto *VPLdSt = cast<VPIntrinsic>(I);
- assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
- VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
- "Unexpected intrinsic");
- Ptr = VPLdSt->getMemoryPointerParam();
- Alignment = VPLdSt->getPointerAlignment().value_or(
- DL.getABITypeAlign(VTy->getElementType()));
-
- assert(Mask && "vp.load and vp.store needs a mask!");
-
- Value *WideEVL = VPLdSt->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
- return false;
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
- return true;
+ auto *II = cast<IntrinsicInst>(I);
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unsupported intrinsic type");
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_store: {
+ auto *VPLdSt = cast<VPIntrinsic>(I);
+ Ptr = VPLdSt->getMemoryPointerParam();
+ Alignment = VPLdSt->getPointerAlignment().value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
+
+ assert(Mask && "vp.load and vp.store need a mask!");
+
+ Value *WideEVL = VPLdSt->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+ return false;
+
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ return true;
+ }
+ case Intrinsic::masked_load: {
+ Ptr = II->getOperand(0);
+ Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();
+
+ if (!isa<UndefValue>(II->getOperand(3)))
+ return false;
+
+ assert(Mask && "masked.load needs a mask!");
+
+ VL = isa<FixedVectorType>(VTy)
+ ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ case Intrinsic::masked_store: {
+ Ptr = II->getOperand(1);
+ Alignment = cast<ConstantInt>(II->getArgOperand(2))->getAlignValue();
+
+ assert(Mask && "masked.store needs a mask!");
+
+ VL = isa<FixedVectorType>(VTy)
+ ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ }
}
/// Lower an interleaved load into a vlsegN intrinsic.
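The two new cases follow the IR signatures llvm.masked.load(ptr, align, mask, passthru) and llvm.masked.store(value, ptr, align, mask); the masked.load case bails out unless the passthru is undef, since a nontrivial passthru has no segment-load equivalent. Both cases share the same VL computation, sketched standalone below (assuming the IRBuilder API exactly as used in the hunk):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Fixed-width vectors get an exact element-count VL; scalable vectors use
// the all-ones sentinel, which the RISC-V segment intrinsics treat as VLMAX.
static Value *computeSegmentVL(IRBuilder<> &Builder, VectorType *VTy,
                               Type *XLenTy) {
  return isa<FixedVectorType>(VTy)
             ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
             : Constant::getAllOnesValue(XLenTy);
}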
@@ -173,7 +204,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+ auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
@@ -193,14 +224,15 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // Note: Same VL as above, but i32 not xlen due to signature of
- // vp.strided.load
- VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
+ // For RV64 we need to truncate the i64 VL to i32 to match the intrinsic
+ // signature. As VL is at most the number of active lanes (which fits in i32), this is safe.
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
+
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
+ Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
CI->addParamAttr(0,
Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
@@ -234,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
- IRBuilder<> Builder(SI);
- const DataLayout &DL = SI->getDataLayout();
+ IRBuilder<> Builder(Store);
+ const DataLayout &DL = Store->getDataLayout();
auto Mask = SVI->getShuffleMask();
auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
// Given SVI : <n*factor x ty>, then VTy : <n x ty>
auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
ShuffleVTy->getNumElements() / Factor);
- if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
- SI->getPointerAddressSpace(), DL))
+ auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
+
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
return false;
- auto *PtrTy = SI->getPointerOperandType();
- auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = PtrTy->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ return false;
unsigned Index;
// If the segment store only has one active lane (i.e. the interleave is
@@ -260,26 +298,26 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned ScalarSizeInBytes =
DL.getTypeStoreSize(ShuffleVTy->getElementType());
Value *Data = SVI->getOperand(0);
- auto *DataVTy = cast<FixedVectorType>(Data->getType());
+ Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
- Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
- Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
-
- CallInst *CI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_store,
- {Data->getType(), BasePtr->getType(), Stride->getType()},
- {Data, BasePtr, Stride, Mask, VL});
- CI->addParamAttr(
- 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+ Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+ // For RV64 we need to truncate the i64 VL to i32 to match the intrinsic
+ // signature. As VL is at most the number of active lanes (which fits in i32), this is safe.
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {VTy, BasePtr->getType(), Stride->getType()},
+ {Data, BasePtr, Stride, LaneMask, VL});
+ Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
+ CI->addParamAttr(1,
+ Attribute::getWithAlignment(CI->getContext(), Alignment));
return true;
}
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+ Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops;
SmallVector<int, 16> NewShuffleMask;
@@ -295,13 +333,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
NewShuffleMask.clear();
}
- // This VL should be OK (should be executable in one vsseg instruction,
- // potentially under larger LMULs) because we checked that the fixed vector
- // type fits in isLegalInterleavedAccessType
- Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
- Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
- Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
+ Ops.append({Ptr, LaneMask, VL});
Builder.CreateCall(VssegNFunc, Ops);
return true;
@@ -318,7 +350,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
VectorType *ResVTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = Load->getDataLayout();
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+ auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
@@ -339,8 +371,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Load->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
- NumElts * SEW / 8),
+ ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8),
Factor);
Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
@@ -381,7 +412,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
const DataLayout &DL = Store->getDataLayout();
- Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+ Type *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
@@ -405,9 +436,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Store->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
- NumElts * SEW / 8),
- Factor);
+ ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8), Factor);
Value *StoredVal = PoisonValue::get(VecTupTy);
for (unsigned i = 0; i < Factor; ++i)
@@ -424,91 +453,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
Builder.CreateCall(VssegNFunc, Operands);
return true;
}
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-/// %is = tail call <vscale x 64 x i8>
-/// @llvm.vector.interleave2.nxv64i8(
-/// <vscale x 32 x i8> %load0,
-/// <vscale x 32 x i8> %load1
-/// %wide.rvl = shl nuw nsw i32 %rvl, 1
-/// tail call void @llvm.vp.store.nxv64i8.p0(
-/// <vscale x 64 x i8> %is, ptr %ptr,
-/// %mask,
-/// i32 %wide.rvl)
-///
-/// Into:
-/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-/// <vscale x 32 x i8> %load1,
-/// <vscale x 32 x i8> %load2, ptr %ptr,
-/// %mask,
-/// i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
- VPIntrinsic *Store, Value *Mask,
- ArrayRef<Value *> InterleaveOperands) const {
- assert(Mask && "Expect a valid mask");
- assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
- "Unexpected intrinsic");
-
- const unsigned Factor = InterleaveOperands.size();
-
- auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
- if (!VTy)
- return false;
-
- const DataLayout &DL = Store->getDataLayout();
- Align Alignment = Store->getParamAlign(1).value_or(
- DL.getABITypeAlign(VTy->getElementType()));
- if (!isLegalInterleavedAccessType(
- VTy, Factor, Alignment,
- Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
- return false;
-
- IRBuilder<> Builder(Store);
- Value *WideEVL = Store->getArgOperand(3);
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
- return false;
-
- auto *PtrTy = Store->getArgOperand(1)->getType();
- auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- Value *EVL =
- Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
- if (isa<FixedVectorType>(VTy)) {
- SmallVector<Value *, 8> Operands(InterleaveOperands);
- Operands.append({Store->getArgOperand(1), Mask, EVL});
- Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
- {VTy, PtrTy, XLenTy}, Operands);
- return true;
- }
-
- unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
- unsigned NumElts = VTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- Store->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
- Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
- Value *StoredVal = PoisonValue::get(VecTupTy);
- for (unsigned i = 0; i < Factor; ++i)
- StoredVal = Builder.CreateCall(
- VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
- Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- Store->getModule(), ScalableVssegIntrIds[Factor - 2],
- {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
- Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
- ConstantInt::get(XLenTy, Log2_64(SEW))};
-
- Builder.CreateCall(VssegNFunc, Operands);
- return true;
-}
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 28d6403..3b19c34 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -48,6 +48,8 @@ using namespace llvm;
STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
STATISTIC(NumTransformedToWInstrs,
"Number of instructions transformed to W-ops");
+STATISTIC(NumTransformedToNonWInstrs,
+ "Number of instructions transformed to non-W-ops");
static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal",
cl::desc("Disable removal of sext.w"),
@@ -67,10 +69,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
bool removeSExtWInstrs(MachineFunction &MF, const RISCVInstrInfo &TII,
const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
- bool stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
- bool appendWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
+ bool canonicalizeWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
+ const RISCVSubtarget &ST,
+ MachineRegisterInfo &MRI);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -721,45 +722,39 @@ bool RISCVOptWInstrs::removeSExtWInstrs(MachineFunction &MF,
return MadeChange;
}
-bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF,
- const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST,
- MachineRegisterInfo &MRI) {
+// Strips or adds W suffixes to eligible instructions depending on the
+// subtarget preferences.
+bool RISCVOptWInstrs::canonicalizeWSuffixes(MachineFunction &MF,
+ const RISCVInstrInfo &TII,
+ const RISCVSubtarget &ST,
+ MachineRegisterInfo &MRI) {
+ bool ShouldStripW = !(DisableStripWSuffix || ST.preferWInst());
+ bool ShouldPreferW = ST.preferWInst();
bool MadeChange = false;
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default:
- continue;
- case RISCV::ADDW: Opc = RISCV::ADD; break;
- case RISCV::ADDIW: Opc = RISCV::ADDI; break;
- case RISCV::MULW: Opc = RISCV::MUL; break;
- case RISCV::SLLIW: Opc = RISCV::SLLI; break;
- }
- if (hasAllWUsers(MI, ST, MRI)) {
- MI.setDesc(TII.get(Opc));
- MadeChange = true;
- }
- }
- }
-
- return MadeChange;
-}
-
-bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
- const RISCVInstrInfo &TII,
- const RISCVSubtarget &ST,
- MachineRegisterInfo &MRI) {
- bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- unsigned WOpc;
- // TODO: Add more?
- switch (MI.getOpcode()) {
+ std::optional<unsigned> WOpc;
+ std::optional<unsigned> NonWOpc;
+ unsigned OrigOpc = MI.getOpcode();
+ switch (OrigOpc) {
default:
continue;
+ case RISCV::ADDW:
+ NonWOpc = RISCV::ADD;
+ break;
+ case RISCV::ADDIW:
+ NonWOpc = RISCV::ADDI;
+ break;
+ case RISCV::MULW:
+ NonWOpc = RISCV::MUL;
+ break;
+ case RISCV::SLLIW:
+ NonWOpc = RISCV::SLLI;
+ break;
+ case RISCV::SUBW:
+ NonWOpc = RISCV::SUB;
+ break;
case RISCV::ADD:
WOpc = RISCV::ADDW;
break;
@@ -773,7 +768,7 @@ bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
WOpc = RISCV::MULW;
break;
case RISCV::SLLI:
- // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits
+ // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits.
if (MI.getOperand(2).getImm() >= 32)
continue;
WOpc = RISCV::SLLIW;
@@ -784,19 +779,30 @@ bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
break;
}
- if (hasAllWUsers(MI, ST, MRI)) {
+ if (ShouldStripW && NonWOpc.has_value() && hasAllWUsers(MI, ST, MRI)) {
+ LLVM_DEBUG(dbgs() << "Replacing " << MI);
+ MI.setDesc(TII.get(NonWOpc.value()));
+ LLVM_DEBUG(dbgs() << " with " << MI);
+ ++NumTransformedToNonWInstrs;
+ MadeChange = true;
+ continue;
+ }
+ // LWU is always converted to LW when possible as 1) LW is compressible
+ // and 2) it helps minimise differences vs RV32.
+ if ((ShouldPreferW || OrigOpc == RISCV::LWU) && WOpc.has_value() &&
+ hasAllWUsers(MI, ST, MRI)) {
LLVM_DEBUG(dbgs() << "Replacing " << MI);
- MI.setDesc(TII.get(WOpc));
+ MI.setDesc(TII.get(WOpc.value()));
MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
MI.clearFlag(MachineInstr::MIFlag::IsExact);
LLVM_DEBUG(dbgs() << " with " << MI);
++NumTransformedToWInstrs;
MadeChange = true;
+ continue;
}
}
}
-
return MadeChange;
}
@@ -813,12 +819,6 @@ bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI);
-
- if (!(DisableStripWSuffix || ST.preferWInst()))
- MadeChange |= stripWSuffixes(MF, TII, ST, MRI);
-
- if (ST.preferWInst())
- MadeChange |= appendWSuffixes(MF, TII, ST, MRI);
-
+ MadeChange |= canonicalizeWSuffixes(MF, TII, ST, MRI);
return MadeChange;
}
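After the merge, canonicalizeWSuffixes classifies each instruction once: it first tries to strip a W suffix when the subtarget does not prefer W forms, and otherwise tries to add one, with LWU -> LW applied unconditionally. A condensed sketch of that decision order, with hypothetical booleans standing in for the real hasAllWUsers/preferWInst queries:

enum class WAction { None, StripW, AddW };

// Sketch only: AllUsersReadLow32 stands in for hasAllWUsers(MI, ST, MRI).
WAction classifyWSuffix(bool HasNonWForm, bool HasWForm, bool IsLWU,
                        bool AllUsersReadLow32, bool PreferW,
                        bool StripDisabled) {
  bool ShouldStripW = !StripDisabled && !PreferW;
  if (ShouldStripW && HasNonWForm && AllUsersReadLow32)
    return WAction::StripW; // e.g. ADDW -> ADD
  // LWU -> LW happens even without preferWInst(): LW is compressible and
  // keeps RV64 output closer to RV32.
  if ((PreferW || IsLWU) && HasWForm && AllUsersReadLow32)
    return WAction::AddW;   // e.g. ADD -> ADDW; wrap flags are cleared
  return WAction::None;
}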
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 3e286a7..bf23812 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
}
+defvar SMX60VLEN = 256;
+defvar SMX60DLEN = !div(SMX60VLEN, 2);
+
+class Get1248Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ true: 1
+ );
+}
+
+// Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
+class Get4816Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 16,
+ true: 4
+ );
+}
+
+// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
+class Get458Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 5,
+ !eq(mx, "M8") : 8,
+ true: 4
+ );
+}
+
+// Widening scaling pattern (4/4/4/4/5/8/8): plateaus at higher LMULs
+// Used for: widening operations
+class Get4588Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 5,
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 8, // M8 is not supported for most widening ops; fall back to the M4 value
+ true: 4
+ );
+}
+
+// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
+class Get461018Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 6,
+ !eq(mx, "M4") : 10,
+ !eq(mx, "M8") : 18,
+ true: 4
+ );
+}
+
+// Used for: e64 multiply pattern, complex ops
+class Get781632Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 8,
+ !eq(mx, "M4") : 16,
+ !eq(mx, "M8") : 32,
+ true: 7
+ );
+}
+
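The digits in each helper-class name appear to enumerate the latencies from M1 (and fractional LMULs, which hit the !cond default) up through M8, with repeats sometimes collapsed. A small C++ rendering of Get4816Latency under that reading, as a sketch only:

#include <cassert>
#include <string>

// Get4816Latency<mx>.c: 4 cycles through M2, 8 at M4, 16 at M8;
// fractional LMULs (MF8/MF4/MF2) take the default, 4.
unsigned get4816Latency(const std::string &MX) {
  if (MX == "M4") return 8;
  if (MX == "M8") return 16;
  return 4;
}

int main() {
  assert(get4816Latency("MF2") == 4 && get4816Latency("M2") == 4);
  assert(get4816Latency("M4") == 8 && get4816Latency("M8") == 16);
}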
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
+ // Pattern of vand, vor, vxor: 4/4/8/16
+ // They are grouped together, so we use the worst case, 4/4/8/16.
+ // TODO: use InstRW to override individual instructions' scheduling data
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
+ // e64 = 7/8/16/32. We use the worst case until we can split by SEW.
+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
+ let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// Widening
+// Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8.
+// We use the worst case for all.
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
-// Vector Integer Division and Remainder
+// Division and remainder operations
+// Pattern of vdivu: 11/11/11/20/40/80/160
+// Pattern of vdiv: 12/12/12/22/44/88/176
+// Pattern of vremu: 12/12/12/22/44/88/176
+// Pattern of vrem: 13/13/13/24/48/96/192
+// We use 12/12/12/24/48/96/192 for all of them.
+// TODO: Create separate WriteVIRem to more closely match the latencies
foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ // Halved for fractional LMULs.
+ defvar Multiplier = !cond(
+ !eq(mx, "MF8") : 12,
+ !eq(mx, "MF4") : 12,
+ !eq(mx, "MF2") : 12,
+ true: 24
+ );
+
+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
}
}
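Composing Get1248Latency with the Multiplier above reproduces the quoted 12/12/12/24/48/96/192 sequence across MF8..M8. A quick self-check (a sketch, not part of the scheduler model):

#include <cassert>
#include <string>

unsigned get1248(const std::string &MX) {
  if (MX == "M2") return 2;
  if (MX == "M4") return 4;
  if (MX == "M8") return 8;
  return 1; // M1 and fractional LMULs
}

unsigned divLatency(const std::string &MX) {
  unsigned Mult = (MX == "MF8" || MX == "MF4" || MX == "MF2") ? 12 : 24;
  return get1248(MX) * Mult;
}

int main() {
  assert(divLatency("MF8") == 12 && divLatency("MF2") == 12);
  assert(divLatency("M1") == 24 && divLatency("M2") == 48);
  assert(divLatency("M4") == 96 && divLatency("M8") == 192);
}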
@@ -381,12 +480,21 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ // Doubled for M2 and M4.
+ defvar Multiplier = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 2,
+ true: 1
+ );
+
+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 12. Vector Fixed-Point Arithmetic Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
index 668e596..6ecddad 100644
--- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
@@ -24,6 +24,18 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
switch (N->getOpcode()) {
default:
return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+ case RISCVISD::TUPLE_EXTRACT:
+ assert(N->getNumOperands() == 2 && "Expected two operands!");
+ assert(N->getOperand(1).getOpcode() == ISD::TargetConstant &&
+ N->getOperand(1).getValueType() == MVT::i32 &&
+ "Expected index to be an i32 target constant!");
+ break;
+ case RISCVISD::TUPLE_INSERT:
+ assert(N->getNumOperands() == 3 && "Expected three operands!");
+ assert(N->getOperand(2).getOpcode() == ISD::TargetConstant &&
+ N->getOperand(2).getValueType() == MVT::i32 &&
+ "Expected index to be an i32 target constant!");
+ break;
case RISCVISD::VQDOT_VL:
case RISCVISD::VQDOTU_VL:
case RISCVISD::VQDOTSU_VL: {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index c754de4..e35ffaf 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -216,7 +216,7 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const {
}
void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// Do bidirectional scheduling since it provides a more balanced scheduling
// leading to better performance. This will increase compile time.
Policy.OnlyTopDown = false;
@@ -231,8 +231,8 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackPressure = true;
}
-void RISCVSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+void RISCVSubtarget::overridePostRASchedPolicy(
+ MachineSchedPolicy &Policy, const SchedRegion &Region) const {
MISched::Direction PostRASchedDirection = getPostRASchedDirection();
if (PostRASchedDirection == MISched::TopDown) {
Policy.OnlyTopDown = true;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4f560cc..fd57e02 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -395,11 +395,11 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
};
-} // End llvm namespace
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b43b915..da6ac2f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -104,11 +104,6 @@ static cl::opt<bool> EnablePostMISchedLoadStoreClustering(
cl::desc("Enable PostRA load and store clustering in the machine scheduler"),
cl::init(true));
-static cl::opt<bool>
- EnableVLOptimizer("riscv-enable-vl-optimizer",
- cl::desc("Enable the RISC-V VL Optimizer pass"),
- cl::init(true), cl::Hidden);
-
static cl::opt<bool> DisableVectorMaskMutation(
"riscv-disable-vector-mask-mutation",
cl::desc("Disable the vector mask scheduling mutation"), cl::init(false),
@@ -617,8 +612,7 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVPreRAExpandPseudoPass());
if (TM->getOptLevel() != CodeGenOptLevel::None) {
addPass(createRISCVMergeBaseOffsetOptPass());
- if (EnableVLOptimizer)
- addPass(createRISCVVLOptimizerPass());
+ addPass(createRISCVVLOptimizerPass());
}
addPass(createRISCVInsertReadWriteCSRPass());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 56ead92..fd634b5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1489,6 +1489,34 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
0, cast<VectorType>(ICA.getReturnType()));
}
+ case Intrinsic::fptoui_sat:
+ case Intrinsic::fptosi_sat: {
+ InstructionCost Cost = 0;
+ bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
+ Type *SrcTy = ICA.getArgTypes()[0];
+
+ auto SrcLT = getTypeLegalizationCost(SrcTy);
+ auto DstLT = getTypeLegalizationCost(RetTy);
+ if (!SrcTy->isVectorTy())
+ break;
+
+ if (!SrcLT.first.isValid() || !DstLT.first.isValid())
+ return InstructionCost::getInvalid();
+
+ Cost +=
+ getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
+ RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
+
+ // Handle NaN.
+ // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
+ // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ return Cost;
+ }
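
For intuition, here is a scalar C++ model of the operation being costed (a sketch; FPToSISat is an illustrative name). RVV FP-to-integer conversions saturate out-of-range values in hardware, so the only extra work the cost model accounts for is the NaN-to-zero fixup, priced above as an fcmp plus a select:

#include <cmath>
#include <cstdint>
#include <limits>

// Scalar model of @llvm.fptosi.sat.i32.f32; the vector form is what the
// code above prices lane-wise.
int32_t FPToSISat(float X) {
  if (std::isnan(X))               // the vmfne + vmerge pair costed above
    return 0;
  if (X <= -2147483648.0f)         // saturation is done by the conversion itself
    return std::numeric_limits<int32_t>::min();
  if (X >= 2147483648.0f)
    return std::numeric_limits<int32_t>::max();
  return static_cast<int32_t>(X);  // the FPToSI cast cost (vfcvt.rtz.x.f.v)
}
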
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 12bf8c1..d62d99c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -116,8 +116,8 @@ public:
}
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
- return ST->hasVInstructions() ? TailFoldingStyle::Data
- : TailFoldingStyle::DataWithoutLaneMask;
+ return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
+ : TailFoldingStyle::None;
}
std::optional<unsigned> getMaxVScale() const override;
std::optional<unsigned> getVScaleForTuning() const override;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index e656e8b..c946451 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -33,6 +33,7 @@ namespace {
class RISCVVLOptimizer : public MachineFunctionPass {
const MachineRegisterInfo *MRI;
const MachineDominatorTree *MDT;
+ const TargetInstrInfo *TII;
public:
static char ID;
@@ -113,14 +114,6 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() {
return new RISCVVLOptimizer();
}
-/// Return true if R is a physical or virtual vector register, false otherwise.
-static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
- if (R.isPhysical())
- return RISCV::VRRegClass.contains(R);
- const TargetRegisterClass *RC = MRI->getRegClass(R);
- return RISCVRI::isVRegClass(RC->TSFlags);
-}
-
LLVM_ATTRIBUTE_UNUSED
static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
OI.print(OS);
@@ -182,37 +175,28 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
return Log2EEW;
}
-/// Check whether MO is a mask operand of MI.
-static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO,
- const MachineRegisterInfo *MRI) {
-
- if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI))
- return false;
-
- const MCInstrDesc &Desc = MI.getDesc();
- return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID;
-}
-
static std::optional<unsigned>
getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
const MachineInstr &MI = *MO.getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
const RISCVVPseudosTable::PseudoInfo *RVV =
RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
assert(RVV && "Could not find MI in PseudoTable");
// MI has a SEW associated with it. The RVV specification defines
// the EEW of each operand and definition in relation to MI.SEW.
- unsigned MILog2SEW =
- MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+ unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(Desc)).getImm();
- const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc());
- const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags);
+ const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
+ const bool IsTied = RISCVII::isTiedPseudo(Desc.TSFlags);
bool IsMODef = MO.getOperandNo() == 0 ||
(HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs());
// All mask operands have EEW=1
- if (isMaskOperand(MI, MO, MRI))
+ const MCOperandInfo &Info = Desc.operands()[MO.getOperandNo()];
+ if (Info.OperandType == MCOI::OPERAND_REGISTER &&
+ Info.RegClass == RISCV::VMV0RegClassID)
return 0;
// switch against BaseInstr to reduce number of cases that need to be
@@ -1291,11 +1275,12 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
return false;
}
- assert(!RISCVII::elementsDependOnVL(RISCV::getRVVMCOpcode(MI.getOpcode())) &&
+ assert(!RISCVII::elementsDependOnVL(
+ TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) &&
"Instruction shouldn't be supported if elements depend on VL");
- assert(MI.getOperand(0).isReg() &&
- isVectorRegClass(MI.getOperand(0).getReg(), MRI) &&
+ assert(RISCVRI::isVRegClass(
+ MRI->getRegClass(MI.getOperand(0).getReg())->TSFlags) &&
"All supported instructions produce a vector register result");
LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n");
@@ -1484,7 +1469,6 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
}
bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
- assert(DemandedVLs.size() == 0);
if (skipFunction(MF.getFunction()))
return false;
@@ -1495,6 +1479,10 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (!ST.hasVInstructions())
return false;
+ TII = ST.getInstrInfo();
+
+ assert(DemandedVLs.empty());
+
// For each instruction that defines a vector, compute what VL its
// downstream users demand.
for (MachineBasicBlock *MBB : post_order(&MF)) {
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 84ef539..c1cc19b 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
return false;
+ // Masked off lanes past TrueVL will come from False, and converting to vmv
+ // will lose these lanes unless MIVL <= TrueVL.
+ // TODO: We could relax this for False == Passthru and True policy == TU
+ const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+ const MachineOperand &TrueVL =
+ True->getOperand(RISCVII::getVLOpNum(True->getDesc()));
+ if (!RISCV::isVLKnownLE(MIVL, TrueVL))
+ return false;
+
// True's passthru needs to be equivalent to False
Register TruePassthruReg = True->getOperand(1).getReg();
Register FalseReg = MI.getOperand(2).getReg();
diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
index bbf1d87..cfe7ef4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
@@ -116,8 +116,8 @@ SPIRVTranslate(Module *M, std::string &SpirvObj, std::string &ErrMsg,
PM.add(new TargetLibraryInfoWrapperPass(TLII));
std::unique_ptr<MachineModuleInfoWrapperPass> MMIWP(
new MachineModuleInfoWrapperPass(Target.get()));
- const_cast<TargetLoweringObjectFile *>(Target->getObjFileLowering())
- ->Initialize(MMIWP->getMMI().getContext(), *Target);
+ Target->getObjFileLowering()->Initialize(MMIWP->getMMI().getContext(),
+ *Target);
SmallString<4096> OutBuffer;
raw_svector_ostream OutStream(OutBuffer);
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index b90e1aa..3c631ce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(
auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType());
if (HandleType->getTargetExtName() == "spirv.Image" ||
HandleType->getTargetExtName() == "spirv.SignedImage") {
- if (II->hasOneUse()) {
- auto *U = *II->users().begin();
+ for (User *U : II->users()) {
Ty = cast<Instruction>(U)->getAccessType();
- assert(Ty && "Unable to get type for resource pointer.");
+ if (Ty)
+ break;
}
} else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") {
// This call is supposed to index into an array
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 6608b3f..d4fa62a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -296,6 +296,8 @@ private:
bool selectImageWriteIntrinsic(MachineInstr &I) const;
bool selectResourceGetPointer(Register &ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectModf(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
// Utilities
std::pair<Register, bool>
@@ -3235,6 +3237,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_discard: {
return selectDiscard(ResVReg, ResType, I);
}
+ case Intrinsic::modf: {
+ return selectModf(ResVReg, ResType, I);
+ }
default: {
std::string DiagMsg;
raw_string_ostream OS(DiagMsg);
@@ -4018,6 +4023,83 @@ bool SPIRVInstructionSelector::selectLog10(Register ResVReg,
.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectModf(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ // llvm.modf takes a single argument (the number to be decomposed) and
+ // returns a struct { restype, restype }, while OpenCLLIB::modf takes two
+ // arguments (the number to be decomposed and a pointer): it returns the
+ // fractional part, and the integral part is stored through the pointer
+ // argument. Therefore, we can't use the OpenCLLIB::modf instruction
+ // directly. However, we can do some scaffolding to make it work. The idea
+ // is to create an alloca instruction to get a pointer, pass this pointer to
+ // OpenCLLIB::modf, and then load the value from this pointer to place it in
+ // the struct. llvm.modf returns the fractional part as the first element of
+ // the result, and the integral part as the second.
+
+ // At this point, the return type is not a struct anymore; it has been split
+ // into two independent values of the SPIR-V result type. We can get each
+ // one from I.getDefs() or I.getOperands().
+ if (STI.canUseExtInstSet(SPIRV::InstructionSet::OpenCL_std)) {
+ MachineIRBuilder MIRBuilder(I);
+ // Get pointer type for alloca variable.
+ const SPIRVType *PtrType = GR.getOrCreateSPIRVPointerType(
+ ResType, MIRBuilder, SPIRV::StorageClass::Function);
+ // Create new register for the pointer type of alloca variable.
+ Register PtrTyReg =
+ MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
+ MIRBuilder.getMRI()->setType(
+ PtrTyReg,
+ LLT::pointer(storageClassToAddressSpace(SPIRV::StorageClass::Function),
+ GR.getPointerSize()));
+ // Assign SPIR-V type of the pointer type of the alloca variable to the
+ // new register.
+ GR.assignSPIRVTypeToVReg(PtrType, PtrTyReg, MIRBuilder.getMF());
+ MachineBasicBlock &EntryBB = I.getMF()->front();
+ MachineBasicBlock::iterator VarPos =
+ getFirstValidInstructionInsertPoint(EntryBB);
+ auto AllocaMIB =
+ BuildMI(EntryBB, VarPos, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
+ .addDef(PtrTyReg)
+ .addUse(GR.getSPIRVTypeID(PtrType))
+ .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function));
+ Register Variable = AllocaMIB->getOperand(0).getReg();
+ // Modf must have 4 operands: the first two are the two parts of the result,
+ // the third is the intrinsic ID, and the last is the floating point value.
+ assert(I.getNumOperands() == 4 &&
+ "Expected 4 operands for modf instruction");
+ MachineBasicBlock &BB = *I.getParent();
+ // Create the OpenCLLIB::modf instruction.
+ auto MIB =
+ BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::OpenCL_std))
+ .addImm(CL::modf)
+ .setMIFlags(I.getFlags())
+ .add(I.getOperand(3)) // Floating point value.
+ .addUse(Variable); // Pointer to integral part.
+ // Assign the integral part stored in the ptr to the second element of the
+ // result.
+ Register IntegralPartReg = I.getOperand(1).getReg();
+ if (IntegralPartReg.isValid()) {
+ // Load the value from the pointer to integral part.
+ auto LoadMIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoad))
+ .addDef(IntegralPartReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Variable);
+ return LoadMIB.constrainAllUses(TII, TRI, RBI);
+ }
+
+ return MIB.constrainAllUses(TII, TRI, RBI);
+ } else if (STI.canUseExtInstSet(SPIRV::InstructionSet::GLSL_std_450)) {
+ assert(false && "GLSL::Modf is deprecated.");
+ // FIXME: GL::Modf is deprecated, use Modfstruct instead.
+ return false;
+ }
+ return false;
+}
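
As a plain C++ analogue of the scaffolding described in the comment above (a sketch; ModfResult and llvmStyleModf are illustrative names, with std::modf standing in for OpenCLLIB::modf, a local variable for the Function-storage OpVariable, and the struct return for the two result registers):

#include <cmath>

struct ModfResult {
  float Frac; // first result element, returned directly by modf
  float Int;  // second result element, loaded back through the pointer
};

ModfResult llvmStyleModf(float X) {
  float IntPart;                       // plays the role of the OpVariable
  float Frac = std::modf(X, &IntPart); // plays the role of OpExtInst modf
  return {Frac, IntPart};              // plays the role of the OpLoad
}
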
+
// Generate the instructions to load 3-element vector builtin input
// IDs/Indices.
// Like: GlobalInvocationId, LocalInvocationId, etc....
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 2bffbf7..595424b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -380,7 +380,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
for (BasicBlock &BB : *F) {
- for (Instruction &I : BB) {
+ for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
if (!Call)
continue;
@@ -408,12 +408,18 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
if (!STI.isShader()) {
Changed |= toSpvOverloadedIntrinsic(
II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
+ } else {
+ II->eraseFromParent();
+ Changed = true;
}
break;
case Intrinsic::lifetime_end:
if (!STI.isShader()) {
Changed |= toSpvOverloadedIntrinsic(
II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
+ } else {
+ II->eraseFromParent();
+ Changed = true;
}
break;
case Intrinsic::ptr_annotation:
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 768efb9..416d811 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -995,4 +995,27 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI,
return foldImm(ResType->getOperand(2), MRI);
}
+MachineBasicBlock::iterator
+getFirstValidInstructionInsertPoint(MachineBasicBlock &BB) {
+ // Find the position to insert the OpVariable instruction.
+ // We will insert it after the last OpFunctionParameter, if any, or
+ // after OpFunction otherwise.
+ MachineBasicBlock::iterator VarPos = BB.begin();
+ while (VarPos != BB.end() && VarPos->getOpcode() != SPIRV::OpFunction) {
+ ++VarPos;
+ }
+ // Advance VarPos to the next instruction after OpFunction; it will either
+ // be an OpFunctionParameter, in which case the next loop will skip past it,
+ // or the position at which to insert the OpVariable instruction.
+ ++VarPos;
+ while (VarPos != BB.end() &&
+ VarPos->getOpcode() == SPIRV::OpFunctionParameter) {
+ ++VarPos;
+ }
+ // VarPos now points just past the last OpFunctionParameter, if any, or
+ // just past OpFunction if there are no parameters.
+ return VarPos != BB.end() && VarPos->getOpcode() == SPIRV::OpLabel ? ++VarPos
+ : VarPos;
+}
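
A sketch of the intended usage, mirroring the call site in selectModf above (VarReg, PtrType, GR, TII, and MF are assumed to be in scope):

// Function-scope OpVariables must come right after OpFunction, its
// OpFunctionParameters, and the entry OpLabel, so new variables are
// built exactly at the returned iterator.
MachineBasicBlock &EntryBB = MF.front();
MachineBasicBlock::iterator IP = getFirstValidInstructionInsertPoint(EntryBB);
BuildMI(EntryBB, IP, DebugLoc(), TII.get(SPIRV::OpVariable))
    .addDef(VarReg)
    .addUse(GR.getSPIRVTypeID(PtrType))
    .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function));
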
+
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index d732188..45c520a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -506,6 +506,8 @@ MachineInstr *getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI);
int64_t foldImm(const MachineOperand &MO, const MachineRegisterInfo *MRI);
unsigned getArrayComponentCount(const MachineRegisterInfo *MRI,
const MachineInstr *ResType);
+MachineBasicBlock::iterator
+getFirstValidInstructionInsertPoint(MachineBasicBlock &BB);
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 4a9c88b..a95c4ff 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 1ee6e80..79da53e 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -13,10 +13,7 @@
#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCValue.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 9b434d8..1aa8efe 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2201,7 +2201,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InGlue;
- Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue);
InGlue = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -2219,7 +2219,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
InGlue};
Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
InGlue = Chain.getValue(1);
- Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL);
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
InGlue = Chain.getValue(1);
SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 2662241e..e6486e2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -256,9 +256,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Precompute the set of registers that are unused, so that we can insert
// drops to their defs.
+ // Also unstackify any stackified registers that have no uses, so that they
+ // can be dropped later. This can happen when transformations that run after
+ // RegStackify remove the instructions that used stackified registers.
BitVector UseEmpty(MRI.getNumVirtRegs());
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I)
- UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I));
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI.use_empty(Reg)) {
+ UseEmpty[I] = true;
+ MFI.unstackifyVReg(Reg);
+ }
+ }
// Visit each instruction in the function.
for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index ac819cf..b03b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -15,12 +15,14 @@
#include "WebAssembly.h"
#include "WebAssemblyISelLowering.h"
#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
@@ -118,6 +120,51 @@ static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) {
return DAG->getTargetExternalSymbol(SymName, PtrVT);
}
+static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL,
+ SmallVector<MVT, 4> &Returns,
+ SmallVector<MVT, 4> &Params) {
+ auto toWasmValType = [](MVT VT) {
+ if (VT == MVT::i32) {
+ return wasm::ValType::I32;
+ }
+ if (VT == MVT::i64) {
+ return wasm::ValType::I64;
+ }
+ if (VT == MVT::f32) {
+ return wasm::ValType::F32;
+ }
+ if (VT == MVT::f64) {
+ return wasm::ValType::F64;
+ }
+ LLVM_DEBUG(errs() << "Unhandled type for llvm.wasm.ref.test.func: " << VT
+ << "\n");
+ llvm_unreachable("Unhandled type for llvm.wasm.ref.test.func");
+ };
+ auto NParams = Params.size();
+ auto NReturns = Returns.size();
+ auto BitWidth = (NParams + NReturns + 2) * 64;
+ auto Sig = APInt(BitWidth, 0);
+
+ // Annoying special case: if getSignificantBits() <= 64 then InstrEmitter will
+ // emit an Imm instead of a CImm. It simplifies WebAssemblyMCInstLower if we
+ // always emit a CImm, so xor NReturns (the first word emitted) with
+ // 0x7ffffff to ensure getSignificantBits() > 64.
+ Sig |= NReturns ^ 0x7ffffff;
+ for (auto &Return : Returns) {
+ auto V = toWasmValType(Return);
+ Sig <<= 64;
+ Sig |= (int64_t)V;
+ }
+ Sig <<= 64;
+ Sig |= NParams;
+ for (auto &Param : Params) {
+ auto V = toWasmValType(Param);
+ Sig <<= 64;
+ Sig |= (int64_t)V;
+ }
+ return Sig;
+}
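
To make the encoding concrete, here is a hedged round-trip sketch for a function of type (i32) -> i64; the APInt holds, from most to least significant 64-bit word: NReturns ^ 0x7ffffff, each return type, NParams, then each param type (demoEncodeDecode is an illustrative name):

#include "llvm/ADT/APInt.h"
#include "llvm/BinaryFormat/Wasm.h"
#include <cassert>

using namespace llvm;

void demoEncodeDecode() {
  // Encode (i32) -> i64: words are [NReturns ^ 0x7ffffff, I64, NParams, I32].
  APInt Sig(4 * 64, 0);
  Sig |= uint64_t(1) ^ 0x7ffffff;               // one return, xored as above
  Sig <<= 64; Sig |= uint64_t(wasm::ValType::I64);
  Sig <<= 64; Sig |= uint64_t(1);               // one parameter
  Sig <<= 64; Sig |= uint64_t(wasm::ValType::I32);

  // Decode, as lowerEncodedFunctionSignature does on the MC side.
  int Idx = Sig.getNumWords();
  auto GetWord = [&] { return Sig.extractBitsAsZExtValue(64, 64 * --Idx); };
  assert(int(GetWord() ^ 0x7ffffff) == 1);            // NReturns
  assert(GetWord() == uint64_t(wasm::ValType::I64));  // return type
  assert(GetWord() == 1);                             // NParams
  assert(GetWord() == uint64_t(wasm::ValType::I32));  // param type
}
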
+
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -189,6 +236,58 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, TLSAlign);
return;
}
+ case Intrinsic::wasm_ref_test_func: {
+ // First emit the TABLE_GET instruction to convert function pointer ==>
+ // funcref
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ auto PtrVT = MVT::getIntegerVT(MF.getDataLayout().getPointerSizeInBits());
+ MCSymbol *Table = WebAssembly::getOrCreateFunctionTableSymbol(
+ MF.getContext(), Subtarget);
+ SDValue TableSym = CurDAG->getMCSymbol(Table, PtrVT);
+ SDValue FuncPtr = Node->getOperand(1);
+ if (Subtarget->hasAddr64() && FuncPtr.getValueType() == MVT::i64) {
+ // table.get expects an i32, but on 64-bit platforms the function pointer
+ // is an i64. In that case, use i32.wrap_i64 to convert it.
+ FuncPtr = SDValue(CurDAG->getMachineNode(WebAssembly::I32_WRAP_I64, DL,
+ MVT::i32, FuncPtr),
+ 0);
+ }
+ SDValue FuncRef =
+ SDValue(CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL,
+ MVT::funcref, TableSym, FuncPtr),
+ 0);
+
+ // Encode the signature information into the type index placeholder.
+ // This gets decoded and converted into the actual type signature in
+ // WebAssemblyMCInstLower.cpp.
+ SmallVector<MVT, 4> Params;
+ SmallVector<MVT, 4> Returns;
+
+ bool IsParam = false;
+ // Operand 0 is the return register, Operand 1 is the function pointer.
+ // The remaining operands encode the type of the function we are testing
+ // for.
+ for (unsigned I = 2, E = Node->getNumOperands(); I < E; ++I) {
+ MVT VT = Node->getOperand(I).getValueType().getSimpleVT();
+ if (VT == MVT::Untyped) {
+ IsParam = true;
+ continue;
+ }
+ if (IsParam) {
+ Params.push_back(VT);
+ } else {
+ Returns.push_back(VT);
+ }
+ }
+ auto Sig = encodeFunctionSignature(CurDAG, DL, Returns, Params);
+
+ auto SigOp = CurDAG->getTargetConstant(
+ Sig, DL, EVT::getIntegerVT(*CurDAG->getContext(), Sig.getBitWidth()));
+ MachineSDNode *RefTestNode = CurDAG->getMachineNode(
+ WebAssembly::REF_TEST_FUNCREF, DL, MVT::i32, {SigOp, FuncRef});
+ ReplaceNode(Node, RefTestNode);
+ return;
+ }
}
break;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index bf2e04c..11936a3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -46,6 +46,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
: TargetLowering(TM), Subtarget(&STI) {
auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32;
+ // Set the load-count limits for the memcmp expansion optimization.
+ MaxLoadsPerMemcmp = 8;
+ MaxLoadsPerMemcmpOptSize = 4;
+
// Booleans always contain 0 or 1.
setBooleanContents(ZeroOrOneBooleanContent);
// Except in SIMD vectors
@@ -794,6 +798,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB,
if (IsIndirect) {
// Placeholder for the type index.
+ // This gets replaced with the correct value in WebAssemblyMCInstLower.cpp
MIB.addImm(0);
// The table into which this call_indirect indexes.
MCSymbolWasm *Table = IsFuncrefCall
@@ -2935,6 +2940,25 @@ performVectorExtendToFPCombine(SDNode *N,
}
static SDValue
+performVectorNonNegToFPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ auto &DAG = DCI.DAG;
+
+ SDNodeFlags Flags = N->getFlags();
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // Optimize uitofp to sitofp when the sign bit is known to be zero.
+ // Depending on the target (runtime) backend, this might be performance
+ // neutral (e.g. AArch64) or a significant improvement (e.g. x86_64).
+ if (VT.isVector() && (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0))) {
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
+
+ return SDValue();
+}
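
The scalar analogue of this rewrite, for intuition (UToF and SToF are illustrative names): when the sign bit of the input is known to be zero, the unsigned and signed conversions produce the same value, and the signed form is typically cheaper to lower:

#include <cstdint>

// uitofp form.
float UToF(uint32_t X) { return static_cast<float>(X); }

// sitofp form; produces the same value whenever X <= INT32_MAX,
// i.e. whenever the sign bit of X is zero.
float SToF(uint32_t X) { return static_cast<float>(static_cast<int32_t>(X)); }
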
+
+static SDValue
performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
assert(N->getOpcode() == ISD::SIGN_EXTEND ||
@@ -3515,6 +3539,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
return performVectorExtendCombine(N, DCI);
case ISD::UINT_TO_FP:
+ if (auto ExtCombine = performVectorExtendToFPCombine(N, DCI))
+ return ExtCombine;
+ return performVectorNonNegToFPCombine(N, DCI);
case ISD::SINT_TO_FP:
return performVectorExtendToFPCombine(N, DCI);
case ISD::FP_TO_SINT_SAT:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index cc36244..4613fcb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -15,13 +15,18 @@
#include "WebAssemblyMCInstLower.h"
#include "MCTargetDesc/WebAssemblyMCAsmInfo.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyMCTypeUtilities.h"
#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "Utils/WebAssemblyTypeUtilities.h"
#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -152,6 +157,34 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand(
return MCOperand::createExpr(Expr);
}
+MCOperand
+WebAssemblyMCInstLower::lowerEncodedFunctionSignature(const APInt &Sig) const {
+ // For APInt, a word is 64 bits on all architectures; see APInt.h.
+ auto NumWords = Sig.getNumWords();
+ SmallVector<wasm::ValType, 4> Params;
+ SmallVector<wasm::ValType, 2> Returns;
+
+ int Idx = NumWords;
+ auto GetWord = [&Idx, &Sig]() {
+ Idx--;
+ return Sig.extractBitsAsZExtValue(64, 64 * Idx);
+ };
+ // Annoying special case: if getSignificantBits() <= 64 then InstrEmitter will
+ // emit an Imm instead of a CImm. It simplifies WebAssemblyMCInstLower if we
+ // always emit a CImm, so the encoder xors NReturns (the first word) with
+ // 0x7ffffff to ensure getSignificantBits() > 64; undo that here.
+ // See encodeFunctionSignature in WebAssemblyISelDAGToDAG.cpp.
+ int NReturns = GetWord() ^ 0x7ffffff;
+ for (int I = 0; I < NReturns; I++) {
+ Returns.push_back(static_cast<wasm::ValType>(GetWord()));
+ }
+ int NParams = GetWord();
+ for (int I = 0; I < NParams; I++) {
+ Params.push_back(static_cast<wasm::ValType>(GetWord()));
+ }
+ return lowerTypeIndexOperand(std::move(Returns), std::move(Params));
+}
+
static void getFunctionReturns(const MachineInstr *MI,
SmallVectorImpl<wasm::ValType> &Returns) {
const Function &F = MI->getMF()->getFunction();
@@ -196,11 +229,30 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
MCOp = MCOperand::createReg(WAReg);
break;
}
+ case llvm::MachineOperand::MO_CImmediate: {
+ // Lower the type index placeholder for ref.test.
+ // Currently this is the only way a CImmediate shows up, so assert if we
+ // see anything unexpected.
+ unsigned DescIndex = I - NumVariadicDefs;
+ assert(DescIndex < Desc.NumOperands && "unexpected CImmediate operand");
+ auto Operands = Desc.operands();
+ const MCOperandInfo &Info = Operands[DescIndex];
+ assert(Info.OperandType == WebAssembly::OPERAND_TYPEINDEX &&
+ "unexpected CImmediate operand");
+ (void)Info;
+ MCOp = lowerEncodedFunctionSignature(MO.getCImm()->getValue());
+ break;
+ }
case MachineOperand::MO_Immediate: {
unsigned DescIndex = I - NumVariadicDefs;
if (DescIndex < Desc.NumOperands) {
- const MCOperandInfo &Info = Desc.operands()[DescIndex];
+ auto Operands = Desc.operands();
+ const MCOperandInfo &Info = Operands[DescIndex];
+ // Replace type index placeholder with actual type index. The type index
+ // placeholders are Immediates and have an operand type of
+ // OPERAND_TYPEINDEX or OPERAND_SIGNATURE.
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+ // Lower type index placeholder for a CALL_INDIRECT instruction
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
@@ -228,6 +280,7 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
break;
}
if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+ // Lower type index placeholder for blocks
auto BT = static_cast<WebAssembly::BlockType>(MO.getImm());
assert(BT != WebAssembly::BlockType::Invalid);
if (BT == WebAssembly::BlockType::Multivalue) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index 9f08499..34404d9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -36,6 +36,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCOperand lowerTypeIndexOperand(SmallVectorImpl<wasm::ValType> &&,
SmallVectorImpl<wasm::ValType> &&) const;
+ MCOperand lowerEncodedFunctionSignature(const APInt &Sig) const;
public:
WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 4f15999..52e7065 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -141,6 +141,21 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
+WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
+WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+
+ Options.AllowOverlappingLoads = true;
+
+ // TODO: Teach WebAssembly backend about load v128.
+
+ Options.LoadSizes.append({8, 4, 2, 1});
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
+
+ return Options;
+}
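
Conceptually, the expansion this enables replaces a libcall with a handful of wide loads and compares; a C++ sketch under that assumption (the real transform happens on the SelectionDAG, using the LoadSizes above):

#include <cstdint>
#include <cstring>

// Before expansion: a call to the memcmp libcall.
bool EqLibcall(const char *A, const char *B) {
  return std::memcmp(A, B, 8) == 0;
}

// After expansion (conceptually): one 8-byte load per side plus a compare,
// i.e. i64.load / i64.load / i64.eq in wasm terms.
bool EqExpanded(const char *A, const char *B) {
  uint64_t X, Y;
  std::memcpy(&X, A, 8);
  std::memcpy(&Y, B, 8);
  return X == Y;
}
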
+
InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index d83b8d1..c915eeb0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -73,6 +73,10 @@ public:
getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
+
+ TTI::MemCmpExpansionOptions
+ enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
+
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1c..8213e51 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ private:
}
PrevState = CurrState;
}
- void onRParen() {
- PrevState = State;
+ bool onRParen(StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
switch (State) {
default:
State = IES_ERROR;
@@ -1054,9 +1054,27 @@ private:
case IES_RBRAC:
case IES_RPAREN:
State = IES_RPAREN;
+ // In the case of a multiply, onRegister has already set IndexReg
+ // directly, with the appropriate scale.
+ // Otherwise, if we just saw a register, it has so far only been stored in
+ // TmpReg, so we need to commit it to the state machine.
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
IC.pushOperator(IC_RPAREN);
break;
}
+ PrevState = CurrState;
+ return false;
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
break;
case AsmToken::LParen: SM.onLParen(); break;
- case AsmToken::RParen: SM.onRParen(); break;
+ case AsmToken::RParen:
+ if (SM.onRParen(ErrMsg)) {
+ return Error(Tok.getLoc(), ErrMsg);
+ }
+ break;
}
if (SM.hadError())
return Error(Tok.getLoc(), "unknown token in expression");
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 387d289..e213923 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -127,7 +127,6 @@ class X86AsmBackend : public MCAsmBackend {
unsigned PrevInstOpcode = 0;
MCBoundaryAlignFragment *PendingBA = nullptr;
std::pair<MCFragment *, size_t> PrevInstPosition;
- bool IsRightAfterData = false;
uint8_t determinePaddingPrefix(const MCInst &Inst) const;
bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
@@ -156,10 +155,13 @@ public:
AlignBranchType = X86AlignBranchKindLoc;
if (X86PadMaxPrefixSize.getNumOccurrences())
TargetPrefixMax = X86PadMaxPrefixSize;
+
+ AllowAutoPadding =
+ AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone;
+ AllowEnhancedRelaxation =
+ AllowAutoPadding && TargetPrefixMax != 0 && X86PadForBranchAlign;
}
- bool allowAutoPadding() const override;
- bool allowEnhancedRelaxation() const override;
void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst,
const MCSubtargetInfo &STI);
void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst);
@@ -365,14 +367,6 @@ static bool hasVariantSymbol(const MCInst &MI) {
return false;
}
-bool X86AsmBackend::allowAutoPadding() const {
- return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
-}
-
-bool X86AsmBackend::allowEnhancedRelaxation() const {
- return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
-}
-
/// X86 has certain instructions which enable interrupts exactly one
/// instruction *after* the instruction which stores to SS. Return true if the
/// given instruction may have such an interrupt delay slot.
@@ -447,7 +441,7 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// semantic.
return false;
- if (IsRightAfterData)
+ if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
// If this instruction follows any data, there is no clear
// instruction boundary, inserting a nop/prefix would change semantic.
return false;
@@ -484,13 +478,26 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
(AlignBranchType & X86::AlignBranchIndirect));
}
+void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ bool AutoPadding = S.getAllowAutoPadding();
+ if (LLVM_LIKELY(!AutoPadding && !X86PadForAlign)) {
+ S.MCObjectStreamer::emitInstruction(Inst, STI);
+ return;
+ }
+
+ auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
+ Backend.emitInstructionBegin(S, Inst, STI);
+ S.MCObjectStreamer::emitInstruction(Inst, STI);
+ Backend.emitInstructionEnd(S, Inst);
+}
+
/// Insert BoundaryAlignFragment before instructions to align branches.
void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
const MCInst &Inst, const MCSubtargetInfo &STI) {
- // Used by canPadInst. Done here, because in emitInstructionEnd, the current
- // fragment will have changed.
- IsRightAfterData =
- isRightAfterData(OS.getCurrentFragment(), PrevInstPosition);
+ bool CanPadInst = canPadInst(Inst, OS);
+ if (CanPadInst)
+ OS.getCurrentFragment()->setAllowAutoPadding(true);
if (!canPadBranches(OS))
return;
@@ -504,7 +511,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
// we call canPadInst (not cheap) twice. However, in the common case, we can
// avoid unnecessary calls to that, as this is otherwise only used for
// relaxable fragments.
- if (!canPadInst(Inst, OS))
+ if (!CanPadInst)
return;
if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
@@ -542,11 +549,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
/// Set the last fragment to be aligned for the BoundaryAlignFragment.
void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
const MCInst &Inst) {
- MCFragment *CF = OS.getCurrentFragment();
- if (CF->getKind() == MCFragment::FT_Relaxable)
- CF->setAllowAutoPadding(canPadInst(Inst, OS));
-
// Update PrevInstOpcode here, canPadInst() reads that.
+ MCFragment *CF = OS.getCurrentFragment();
PrevInstOpcode = Inst.getOpcode();
PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
@@ -570,8 +574,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
OS.newFragment();
// Update the maximum alignment on the current section if necessary.
- MCSection *Sec = OS.getCurrentSectionOnly();
- Sec->ensureMinAlignment(AlignBoundary);
+ CF->getParent()->ensureMinAlignment(AlignBoundary);
}
std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
@@ -923,13 +926,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
continue;
}
- const uint64_t OrigSize = Asm.computeFragmentSize(F);
-
// To keep the effects local, prefer to relax instructions closest to
// the align directive. This is purely about human understandability
// of the resulting code. If we later find a reason to expand
// particular instructions over others, we can adjust.
- unsigned RemainingSize = OrigSize;
+ unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize();
while (!Relaxable.empty() && RemainingSize != 0) {
auto &RF = *Relaxable.pop_back_val();
// Give the backend a chance to play any tricks it wishes to increase
@@ -1542,14 +1543,6 @@ public:
};
} // end anonymous namespace
-void X86_MC::emitInstruction(MCObjectStreamer &S, const MCInst &Inst,
- const MCSubtargetInfo &STI) {
- auto &Backend = static_cast<X86AsmBackend &>(S.getAssembler().getBackend());
- Backend.emitInstructionBegin(S, Inst, STI);
- S.MCObjectStreamer::emitInstruction(Inst, STI);
- Backend.emitInstructionEnd(S, Inst);
-}
-
void X86ELFStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
X86_MC::emitInstruction(*this, Inst, STI);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index f5eeb3b..d691538 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "X86MCAsmInfo.h"
-#include "MCTargetDesc/X86MCExpr.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index efb951b..e02b556 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -151,6 +151,7 @@ private:
MCSymbol *LazyPointer) override;
void emitCallInstruction(const llvm::MCInst &MCI);
+ void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI);
// Emits a label to mark the next instruction as being relevant to Import Call
// Optimization.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6281124..11ab8dc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5001,9 +5001,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
- assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
+ // Can't split constant.
+ if ((SizeInBits % EltSizeInBits) != 0)
+ return false;
+
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
@@ -45059,6 +45062,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
unsigned NumElts = DemandedElts.getBitWidth();
switch (Op.getOpcode()) {
+ case X86ISD::GlobalBaseReg:
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ return true;
case X86ISD::BLENDI:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -45098,27 +45105,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE vector insert/extracts use modulo indices.
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW:
+ return false;
// SSE vector multiplies are either inbounds or saturate.
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD:
+ return false;
// SSE vector shifts handle out of bounds shift amounts.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
return false;
- // SSE blends.
+ // SSE blends.
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
- // SSE target shuffles.
+ // SSE target shuffles.
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPI:
case X86ISD::VPERMV3:
return false;
- // SSE comparisons handle all icmp/fcmp cases.
- // TODO: Add CMPM/MM with test coverage.
+ // SSE comparisons handle all icmp/fcmp cases.
+ // TODO: Add CMPM/MM with test coverage.
case X86ISD::CMPP:
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2636979..547b221 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1668,7 +1668,8 @@ namespace llvm {
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 360293bc..636b072 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
0 &&
"Invalid interleaved store");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
+
// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
auto Mask = SVI->getShuffleMask();
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 45d596b..481a9be 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
+ maybeEmitNopAfterCallForWindowsEH(&MI);
}
// Record our statepoint node in the same section used by STACKMAP
@@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
OutStreamer->emitLabel(FallthroughLabel);
}
-// Returns instruction preceding MBBI in MachineFunction.
-// If MBBI is the first instruction of the first basic block, returns null.
-static MachineBasicBlock::const_iterator
-PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
- const MachineBasicBlock *MBB = MBBI->getParent();
- while (MBBI == MBB->begin()) {
- if (MBB == &MBB->getParent()->front())
- return MachineBasicBlock::const_iterator();
- MBB = MBB->getPrevNode();
- MBBI = MBB->end();
- }
- --MBBI;
- return MBBI;
-}
-
static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) {
if (X86II::isKMasked(MI->getDesc().TSFlags)) {
// Skip mask operand.
@@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
OutStreamer->AddComment("EVEX TO EVEX Compression ", false);
}
+ // We use this to suppress NOP padding for Windows EH.
+ bool IsTailJump = false;
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr:
@@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPm64_REX:
@@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr64_REX: {
@@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
}
@@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::SEH_BeginEpilogue: {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- // Windows unwinder will not invoke function's exception handler if IP is
- // either in prologue or in epilogue. This behavior causes a problem when a
- // call immediately precedes an epilogue, because the return address points
- // into the epilogue. To cope with that, we insert a 'nop' if it ends up
- // immediately after a CALL in the final emitted code.
- MachineBasicBlock::const_iterator MBBI(MI);
- // Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI);
- MBBI != MachineBasicBlock::const_iterator();
- MBBI = PrevCrossBBInst(MBBI)) {
- // Pseudo instructions that aren't a call are assumed to not emit any
- // code. If they do, we worst case generate unnecessary noops after a
- // call.
- if (MBBI->isCall() || !MBBI->isPseudo()) {
- if (MBBI->isCall())
- EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
- break;
- }
- }
-
EmitSEHInstruction(MI);
return;
}
@@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// For Import Call Optimization to work, we need a 3-byte nop after the
// call instruction.
emitNop(*OutStreamer, 3, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
break;
@@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (MI->isCall()) {
emitCallInstruction(TmpInst);
+ // Since tail calls transfer control without leaving a stack frame, there is
+ // never a need to pad a tail call with a NOP.
+ if (!IsTailJump)
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
+// Determines whether a NOP is required after a CALL, so that Windows EH
+// IP2State tables have the correct information.
+//
+// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32),
+// exception handling works by looking up instruction pointers in lookup
+// tables. These lookup tables are stored in .xdata sections in executables.
+// One element of the lookup tables is the "IP2State" table (Instruction
+// Pointer to State).
+//
+// If a function has any instructions that require cleanup during exception
+// unwinding, then it will have an IP2State table. Each entry in the IP2State
+// table describes a range of bytes in the function's instruction stream, and
+// associates an "EH state number" with that range of instructions. A value of
+// -1 means "the null state", which does not require any code to execute.
+// A value other than -1 is an index into the State table.
+//
+// The entries in the IP2State table contain byte offsets within the instruction
+// stream of the function. The Windows ABI requires that these offsets are
+// aligned to instruction boundaries; they are not permitted to point to a byte
+// that is not the first byte of an instruction.
+//
+// Unfortunately, CALL instructions present a problem during unwinding. CALL
+// instructions push the address of the instruction after the CALL instruction,
+// so that execution can resume after the CALL. If the CALL is the last
+// instruction within an IP2State region, then the return address (on the stack)
+// points to the *next* IP2State region. This means that the unwinder will
+// use the wrong cleanup funclet during unwinding.
+//
+// To fix this problem, the Windows AMD64 ABI requires that CALL instructions
+// are never placed at the end of an IP2State region. Stated equivalently, the
+// end of a CALL instruction cannot be aligned to an IP2State boundary. If a
+// CALL instruction would occur at the end of an IP2State region, then the
+// compiler must insert a NOP instruction after the CALL. The NOP instruction
+// is placed in the same EH region as the CALL instruction, so that the return
+// address points to the NOP and the unwinder will locate the correct region.
+//
+// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32,
+// instructions have a fixed size so the unwinder knows how to "back up" by
+// one instruction.
+//
+// Interaction with Import Call Optimization (ICO):
+//
+// Import Call Optimization (ICO) is a compiler + OS feature on Windows which
+// improves the performance and security of DLL imports. ICO relies on using a
+// specific CALL idiom that can be replaced by the OS DLL loader. This removes
+// a load and indirect CALL and replaces it with a single direct CALL.
+//
+// To achieve this, ICO also inserts NOPs after the CALL instruction. If the
+// end of the CALL is aligned with an EH state transition, we *also* insert
+// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot
+// be combined into a single larger NOP; nor can the second NOP be removed.
+//
+// This is necessary because, if ICO is active and the call site is modified
+// by the loader, the loader will end up overwriting the NOPs that were inserted
+// for ICO. That means that those NOPs cannot be used for the correct
+// termination of the exception handling region (the IP2State transition),
+// so we still need an additional NOP instruction. The NOPs cannot be combined
+// into a longer NOP (which is ordinarily desirable) because then ICO would
+// split one instruction, producing a malformed instruction after the ICO call.
+void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) {
+ // We only need to insert NOPs after CALLs when targeting Windows on AMD64.
+ // (Don't let the name fool you: Itanium refers to table-based exception
+ // handling, not the Itanium architecture.)
+ if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH ||
+ MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) {
+ return;
+ }
+
+ bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr;
+
+ // Set up MBB iterator, initially positioned on the same MBB as MI.
+ MachineFunction::const_iterator MFI(MI->getParent());
+ MachineFunction::const_iterator MFE(MF->end());
+
+ // Set up instruction iterator, positioned immediately *after* MI.
+ MachineBasicBlock::const_iterator MBBI(MI);
+ MachineBasicBlock::const_iterator MBBE = MI->getParent()->end();
+ ++MBBI; // Step over MI
+
+ // This outer loop iterates over basic blocks.
+ for (;;) {
+ // This inner loop iterates over the instructions in the current block.
+ for (; MBBI != MBBE; ++MBBI) {
+ // Check the instruction that follows this CALL.
+ const MachineInstr &NextMI = *MBBI;
+
+ // If there is an EH_LABEL after this CALL, then there is an EH state
+ // transition after this CALL. This is exactly the situation which
+ // requires NOP padding.
+ if (NextMI.isEHLabel()) {
+ if (HasEHPersonality) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+ // We actually want to continue, in case there is an SEH_BeginEpilogue
+ // instruction after the EH_LABEL. In some situations, IR is produced
+ // that contains EH_LABEL pseudo-instructions, even when we are not
+ // generating IP2State tables. We still need to insert a NOP before
+ // SEH_BeginEpilogue in that case.
+ continue;
+ }
+
+ // Somewhat similarly, if the CALL is the last instruction before the
+ // SEH epilogue, then we also need a NOP. This is necessary because the
+ // Windows stack unwinder will not invoke a function's exception handler
+ // if the instruction pointer is in the function prologue or epilogue.
+ //
+ // We always emit a NOP before SEH_BeginEpilogue, even if there is no
+ // personality function (unwind info) for this frame. This is the same
+ // behavior as MSVC.
+ if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+
+ if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) {
+ // We found a real instruction. During the CALL, the return IP will
+ // point to this instruction. Since this instruction has the same EH
+ // state as the call itself (because there is no intervening EH_LABEL),
+ // the IP2State table will be accurate; there is no need to insert a
+ // NOP.
+ return;
+ }
+
+ // The next instruction is a pseudo-op. Ignore it and keep searching.
+ // Because these instructions do not generate any machine code, they
+ // cannot prevent the IP2State table from pointing at the wrong
+ // instruction during a CALL.
+ }
+
+ // We've reached the end of this MBB. Find the next MBB in program order.
+ // MBB order should be finalized by this point, so falling across MBBs is
+ // expected.
+ ++MFI;
+ if (MFI == MFE) {
+ // No more blocks; we've reached the end of the function. This should
+ // only happen with no-return functions, but double-check to be sure.
+ if (HasEHPersonality) {
+ // If the CALL's parent block has no successors, this is a call to a
+ // noreturn function. Insert an INT3 instead of a NOP; this accomplishes
+ // the same purpose, and is clearer to read. Also, analysis tools will understand
+ // that they should not continue disassembling after the CALL (unless
+ // there are other branches to that label).
+ if (MI->getParent()->succ_empty())
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ else
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ }
+ return;
+ }
+
+ // Set up iterator to scan the next basic block.
+ const MachineBasicBlock *NextMBB = &*MFI;
+ MBBI = NextMBB->instr_begin();
+ MBBE = NextMBB->instr_end();
+ }
+}
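
A minimal source-level reproducer of the situation this function guards against, assuming MSVC-style table-based EH on Windows x64 (MayThrow and Demo are hypothetical):

// The call can be the last instruction of the try region's IP2State range.
// Without a trailing NOP, the pushed return address would land in the next
// range and the unwinder would pick the wrong EH state.
void MayThrow();

int Demo() {
  try {
    MayThrow(); // CALL at the end of an EH state region -> NOP emitted after
  } catch (...) {
    return 1;
  }
  return 0;
}
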
+
void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
ImportCallKind Kind) {
assert(EnableImportCallOptimization);
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 9432fc2..7e35832 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -55,7 +55,7 @@ std::optional<AArch64::FMVInfo> lookupFMVByID(AArch64::ArchExtKind ExtID) {
return {};
}
-uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
+APInt AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
// Transitively enable the Arch Extensions which correspond to each feature.
ExtensionSet FeatureBits;
for (const StringRef Feature : Features) {
@@ -69,15 +69,15 @@ uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
}
// Construct a bitmask for all the transitively enabled Arch Extensions.
- uint64_t PriorityMask = 0;
+ APInt PriorityMask = APInt::getZero(128);
for (const FMVInfo &Info : getFMVInfo())
if (Info.ID && FeatureBits.Enabled.test(*Info.ID))
- PriorityMask |= (1ULL << Info.PriorityBit);
+ PriorityMask.setBit(Info.PriorityBit);
return PriorityMask;
}
-uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
+APInt AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
// Transitively enable the Arch Extensions which correspond to each feature.
ExtensionSet FeatureBits;
for (const StringRef Feature : Features)
@@ -86,10 +86,10 @@ uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
FeatureBits.enable(*Info->ID);
// Construct a bitmask for all the transitively enabled Arch Extensions.
- uint64_t FeaturesMask = 0;
+ APInt FeaturesMask = APInt::getZero(128);
for (const FMVInfo &Info : getFMVInfo())
if (Info.ID && FeatureBits.Enabled.test(*Info.ID))
- FeaturesMask |= (1ULL << Info.FeatureBit);
+ FeaturesMask.setBit(Info.FeatureBit);
return FeaturesMask;
}
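A short sketch of the widened mask arithmetic (assuming only the APInt API used above; the bit positions are illustrative):

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;

    APInt PriorityMask = APInt::getZero(128); // room for more than 64 bits
    PriorityMask.setBit(70);                  // a bit uint64_t could not hold
    PriorityMask.setBit(3);

    APInt Other = APInt::getZero(128);
    Other.setBit(3);
    bool Higher = PriorityMask.ugt(Other);    // unsigned compare replaces '>'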
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 4ca7444..e5c896f 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
+ Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index be51453..ee6651c 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -8,7 +8,6 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/CodeGen.h"
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 8c156c9..7af5ba4 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -842,6 +842,162 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
return true;
}
+/// A store of ValWidth bits of Val, starting at bit ValOffset, to the
+/// address PtrBase+PtrOffset.
+struct PartStore {
+ Value *PtrBase;
+ APInt PtrOffset;
+ Value *Val;
+ uint64_t ValOffset;
+ uint64_t ValWidth;
+ StoreInst *Store;
+
+ bool isCompatibleWith(const PartStore &Other) const {
+ return PtrBase == Other.PtrBase && Val == Other.Val;
+ }
+
+ bool operator<(const PartStore &Other) const {
+ return PtrOffset.slt(Other.PtrOffset);
+ }
+};
+
+static std::optional<PartStore> matchPartStore(Instruction &I,
+ const DataLayout &DL) {
+ auto *Store = dyn_cast<StoreInst>(&I);
+ if (!Store || !Store->isSimple())
+ return std::nullopt;
+
+ Value *StoredVal = Store->getValueOperand();
+ Type *StoredTy = StoredVal->getType();
+ if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
+ return std::nullopt;
+
+ uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
+ uint64_t ValOffset = 0;
+ Value *Val;
+ if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
+ m_ConstantInt(ValOffset))),
+ m_Trunc(m_Value(Val)))))
+ return std::nullopt;
+
+ Value *Ptr = Store->getPointerOperand();
+ APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets(
+ DL, PtrOffset, /*AllowNonInbounds=*/true);
+ return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
+}
+
+static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
+ unsigned Width, const DataLayout &DL,
+ TargetTransformInfo &TTI) {
+ if (Parts.size() < 2)
+ return false;
+
+ // Check whether combining the stores is profitable.
+ // FIXME: We could generate smaller stores if we can't produce a large one.
+ const PartStore &First = Parts.front();
+ LLVMContext &Ctx = First.Store->getContext();
+ Type *NewTy = Type::getIntNTy(Ctx, Width);
+ unsigned Fast = 0;
+ if (!TTI.isTypeLegal(NewTy) ||
+ !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
+ First.Store->getPointerAddressSpace(),
+ First.Store->getAlign(), &Fast) ||
+ !Fast)
+ return false;
+
+ // Generate the combined store.
+ IRBuilder<> Builder(First.Store);
+ Value *Val = First.Val;
+ if (First.ValOffset != 0)
+ Val = Builder.CreateLShr(Val, First.ValOffset);
+ Val = Builder.CreateTrunc(Val, NewTy);
+ StoreInst *Store = Builder.CreateAlignedStore(
+ Val, First.Store->getPointerOperand(), First.Store->getAlign());
+
+ AAMDNodes AATags = First.Store->getAAMetadata();
+ for (const PartStore &Part : drop_begin(Parts))
+ AATags = AATags.concat(Part.Store->getAAMetadata());
+ Store->setAAMetadata(AATags);
+
+ // Remove the old stores.
+ for (const PartStore &Part : Parts)
+ Part.Store->eraseFromParent();
+
+ return true;
+}
+
+static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
+ const DataLayout &DL, TargetTransformInfo &TTI) {
+ if (Parts.size() < 2)
+ return false;
+
+ // We now have multiple parts of the same value stored to the same pointer.
+ // Sort the parts by pointer offset, and make sure they are consistent with
+ // the value offsets. Also check that the value is fully covered without
+ // overlaps.
+ bool Changed = false;
+ llvm::sort(Parts);
+ int64_t LastEndOffsetFromFirst = 0;
+ const PartStore *First = &Parts[0];
+ for (const PartStore &Part : Parts) {
+ APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
+ int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
+ if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
+ LastEndOffsetFromFirst != ValOffsetFromFirst) {
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part),
+ LastEndOffsetFromFirst, DL, TTI);
+ First = &Part;
+ LastEndOffsetFromFirst = Part.ValWidth;
+ continue;
+ }
+
+ LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
+ }
+
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()),
+ LastEndOffsetFromFirst, DL, TTI);
+ return Changed;
+}
+
+static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
+ TargetTransformInfo &TTI, AliasAnalysis &AA) {
+ // FIXME: Add big endian support.
+ if (DL.isBigEndian())
+ return false;
+
+ BatchAAResults BatchAA(AA);
+ SmallVector<PartStore, 8> Parts;
+ bool MadeChange = false;
+ for (Instruction &I : make_early_inc_range(BB)) {
+ if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
+ if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
+ Parts.push_back(std::move(*Part));
+ continue;
+ }
+
+ MadeChange |= mergePartStores(Parts, DL, TTI);
+ Parts.clear();
+ Parts.push_back(std::move(*Part));
+ continue;
+ }
+
+ if (Parts.empty())
+ continue;
+
+ if (I.mayThrow() ||
+ (I.mayReadOrWriteMemory() &&
+ isModOrRefSet(BatchAA.getModRefInfo(
+ &I, MemoryLocation::getBeforeOrAfter(Parts[0].PtrBase))))) {
+ MadeChange |= mergePartStores(Parts, DL, TTI);
+ Parts.clear();
+ continue;
+ }
+ }
+
+ MadeChange |= mergePartStores(Parts, DL, TTI);
+ return MadeChange;
+}
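An illustrative source pattern that matchPartStore/mergePartStores target (a sketch; on a little-endian target with fast misaligned i32 stores, the four byte stores collapse into one 32-bit store of v):

    void write_u32(unsigned char *p, unsigned v) {
      p[0] = (unsigned char)(v);        // ValOffset 0,  PtrOffset 0
      p[1] = (unsigned char)(v >> 8);   // ValOffset 8,  PtrOffset 1
      p[2] = (unsigned char)(v >> 16);  // ValOffset 16, PtrOffset 2
      p[3] = (unsigned char)(v >> 24);  // ValOffset 24, PtrOffset 3
    }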
+
/// Combine away instructions provided they are still equivalent when compared
/// against 0, i.e. do they have any bits set.
static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder) {
@@ -1330,6 +1486,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
// bugs.
MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
}
+
+  // Do this separately to avoid scanning the same stores multiple times.
+ MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
}
// We're done with transforms, so remove dead instructions.
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index e279fec..6561b1c 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -170,6 +170,12 @@ void Lowerer::hidePromiseAlloca(CoroIdInst *CoroId, CoroBeginInst *CoroBegin) {
auto *PI = Builder.CreateIntrinsic(
Builder.getPtrTy(), Intrinsic::coro_promise, Arg, {}, "promise.addr");
PI->setCannotDuplicate();
+ // Remove lifetime markers, as these are only allowed on allocas.
+ for (User *U : make_early_inc_range(PA->users())) {
+ auto *I = cast<Instruction>(U);
+ if (I->isLifetimeStartOrEnd())
+ I->eraseFromParent();
+ }
PA->replaceUsesWithIf(PI, [CoroId](Use &U) {
bool IsBitcast = U == U.getUser()->stripPointerCasts();
bool IsCoroId = U.getUser() == CoroId;
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index a65d0fb..3320508 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -553,7 +553,6 @@ static void cacheDIVar(FrameDataInfo &FrameData,
if (I != Container.end())
DIVarCache.insert({V, (*I)->getVariable()});
};
- CacheIt(findDbgDeclares(V));
CacheIt(findDVRDeclares(V));
}
}
@@ -1219,10 +1218,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
auto *G = GetFramePointer(Alloca);
G->setName(Alloca->getName() + Twine(".reload.addr"));
- SmallVector<DbgVariableIntrinsic *, 4> DIs;
SmallVector<DbgVariableRecord *> DbgVariableRecords;
- findDbgUsers(DIs, Alloca, &DbgVariableRecords);
- assert(DIs.empty() && "Should never see debug-intrinsics");
+ findDbgUsers(Alloca, DbgVariableRecords);
for (auto *DVR : DbgVariableRecords)
DVR->replaceVariableLocationOp(Alloca, G);
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 59ae057..ac93f748 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -85,6 +85,9 @@ static Intrinsic::ID NonOverloadedCoroIntrinsics[] = {
Intrinsic::coro_id_async,
Intrinsic::coro_id_retcon,
Intrinsic::coro_id_retcon_once,
+ Intrinsic::coro_noop,
+ Intrinsic::coro_prepare_async,
+ Intrinsic::coro_prepare_retcon,
Intrinsic::coro_promise,
Intrinsic::coro_resume,
Intrinsic::coro_save,
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index 5fd5f7d..4e71768 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -519,10 +519,8 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
// We would handle the dbg.values for allocas specially
for (auto &Iter : Spills) {
auto *V = Iter.first;
- SmallVector<DbgValueInst *, 16> DVIs;
SmallVector<DbgVariableRecord *, 16> DVRs;
- findDbgValues(DVIs, V, &DVRs);
- assert(DVIs.empty());
+ findDbgValues(V, DVRs);
// Add the instructions which carry debug info that is in the frame.
for (DbgVariableRecord *DVR : DVRs)
if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index 5a87cf8..b3910c4 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -63,7 +64,7 @@ static inline void eraseFromModule(T &ToErase) {
ToErase.eraseFromParent();
}
-static inline bool checkIfSupported(GlobalVariable &G) {
+static bool checkIfSupported(GlobalVariable &G) {
if (!G.isThreadLocal())
return true;
@@ -114,24 +115,221 @@ static inline void clearModule(Module &M) { // TODO: simplify.
eraseFromModule(*M.ifuncs().begin());
}
+static SmallVector<std::reference_wrapper<Use>>
+collectIndirectableUses(GlobalVariable *G) {
+ // We are interested only in use chains that end in an Instruction.
+ SmallVector<std::reference_wrapper<Use>> Uses;
+
+ SmallVector<std::reference_wrapper<Use>> Stack(G->use_begin(), G->use_end());
+ while (!Stack.empty()) {
+ Use &U = Stack.pop_back_val();
+ if (isa<Instruction>(U.getUser()))
+ Uses.emplace_back(U);
+ else
+ transform(U.getUser()->uses(), std::back_inserter(Stack),
+ [](auto &&U) { return std::ref(U); });
+ }
+
+ return Uses;
+}
+
+static inline GlobalVariable *getGlobalForName(GlobalVariable *G) {
+ // Create an anonymous global which stores the variable's name, which will be
+ // used by the HIPSTDPAR runtime to look up the program-wide symbol.
+ LLVMContext &Ctx = G->getContext();
+ auto *CDS = ConstantDataArray::getString(Ctx, G->getName());
+
+ GlobalVariable *N = G->getParent()->getOrInsertGlobal("", CDS->getType());
+ N->setInitializer(CDS);
+ N->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage);
+ N->setConstant(true);
+
+ return N;
+}
+
+static inline GlobalVariable *getIndirectionGlobal(Module *M) {
+ // Create an anonymous global which stores a pointer to a pointer, which will
+ // be externally initialised by the HIPSTDPAR runtime with the address of the
+ // program-wide symbol.
+ Type *PtrTy = PointerType::get(
+ M->getContext(), M->getDataLayout().getDefaultGlobalsAddressSpace());
+ GlobalVariable *NewG = M->getOrInsertGlobal("", PtrTy);
+
+ NewG->setInitializer(PoisonValue::get(NewG->getValueType()));
+ NewG->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage);
+ NewG->setConstant(true);
+ NewG->setExternallyInitialized(true);
+
+ return NewG;
+}
+
+static Constant *
+appendIndirectedGlobal(const GlobalVariable *IndirectionTable,
+ SmallVector<Constant *> &SymbolIndirections,
+ GlobalVariable *ToIndirect) {
+ Module *M = ToIndirect->getParent();
+
+ auto *InitTy = cast<StructType>(IndirectionTable->getValueType());
+ auto *SymbolListTy = cast<StructType>(InitTy->getStructElementType(2));
+ Type *NameTy = SymbolListTy->getElementType(0);
+ Type *IndirectTy = SymbolListTy->getElementType(1);
+
+ Constant *NameG = getGlobalForName(ToIndirect);
+ Constant *IndirectG = getIndirectionGlobal(M);
+ Constant *Entry = ConstantStruct::get(
+ SymbolListTy, {ConstantExpr::getAddrSpaceCast(NameG, NameTy),
+ ConstantExpr::getAddrSpaceCast(IndirectG, IndirectTy)});
+ SymbolIndirections.push_back(Entry);
+
+ return IndirectG;
+}
+
+static void fillIndirectionTable(GlobalVariable *IndirectionTable,
+ SmallVector<Constant *> Indirections) {
+ Module *M = IndirectionTable->getParent();
+ size_t SymCnt = Indirections.size();
+
+ auto *InitTy = cast<StructType>(IndirectionTable->getValueType());
+ Type *SymbolListTy = InitTy->getStructElementType(1);
+ auto *SymbolTy = cast<StructType>(InitTy->getStructElementType(2));
+
+ Constant *Count = ConstantInt::get(InitTy->getStructElementType(0), SymCnt);
+ M->removeGlobalVariable(IndirectionTable);
+ GlobalVariable *Symbols =
+ M->getOrInsertGlobal("", ArrayType::get(SymbolTy, SymCnt));
+ Symbols->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage);
+ Symbols->setInitializer(
+ ConstantArray::get(ArrayType::get(SymbolTy, SymCnt), {Indirections}));
+ Symbols->setConstant(true);
+
+ Constant *ASCSymbols = ConstantExpr::getAddrSpaceCast(Symbols, SymbolListTy);
+ Constant *Init = ConstantStruct::get(
+ InitTy, {Count, ASCSymbols, PoisonValue::get(SymbolTy)});
+ M->insertGlobalVariable(IndirectionTable);
+ IndirectionTable->setInitializer(Init);
+}
+
+static void replaceWithIndirectUse(const Use &U, const GlobalVariable *G,
+ Constant *IndirectedG) {
+ auto *I = cast<Instruction>(U.getUser());
+
+ IRBuilder<> Builder(I);
+ unsigned OpIdx = U.getOperandNo();
+ Value *Op = I->getOperand(OpIdx);
+
+ // We walk back up the use chain, which could be an arbitrarily long sequence
+ // of constexpr AS casts, ptr-to-int and GEP instructions, until we reach the
+ // indirected global.
+ while (auto *CE = dyn_cast<ConstantExpr>(Op)) {
+ assert((CE->getOpcode() == Instruction::GetElementPtr ||
+ CE->getOpcode() == Instruction::AddrSpaceCast ||
+ CE->getOpcode() == Instruction::PtrToInt) &&
+ "Only GEP, ASCAST or PTRTOINT constant uses supported!");
+
+ Instruction *NewI = Builder.Insert(CE->getAsInstruction());
+ I->replaceUsesOfWith(Op, NewI);
+ I = NewI;
+ Op = I->getOperand(0);
+ OpIdx = 0;
+ Builder.SetInsertPoint(I);
+ }
+
+ assert(Op == G && "Must reach indirected global!");
+
+ I->setOperand(OpIdx, Builder.CreateLoad(G->getType(), IndirectedG));
+}
+
+static inline bool isValidIndirectionTable(GlobalVariable *IndirectionTable) {
+ std::string W;
+ raw_string_ostream OS(W);
+
+ Type *Ty = IndirectionTable->getValueType();
+ bool Valid = false;
+
+ if (!isa<StructType>(Ty)) {
+ OS << "The Indirection Table must be a struct type; ";
+ Ty->print(OS);
+ OS << " is incorrect.\n";
+ } else if (cast<StructType>(Ty)->getNumElements() != 3u) {
+ OS << "The Indirection Table must have 3 elements; "
+ << cast<StructType>(Ty)->getNumElements() << " is incorrect.\n";
+ } else if (!isa<IntegerType>(cast<StructType>(Ty)->getStructElementType(0))) {
+ OS << "The first element in the Indirection Table must be an integer; ";
+ cast<StructType>(Ty)->getStructElementType(0)->print(OS);
+ OS << " is incorrect.\n";
+ } else if (!isa<PointerType>(cast<StructType>(Ty)->getStructElementType(1))) {
+ OS << "The second element in the Indirection Table must be a pointer; ";
+ cast<StructType>(Ty)->getStructElementType(1)->print(OS);
+ OS << " is incorrect.\n";
+ } else if (!isa<StructType>(cast<StructType>(Ty)->getStructElementType(2))) {
+ OS << "The third element in the Indirection Table must be a struct type; ";
+ cast<StructType>(Ty)->getStructElementType(2)->print(OS);
+ OS << " is incorrect.\n";
+ } else {
+ Valid = true;
+ }
+
+ if (!Valid)
+ IndirectionTable->getContext().diagnose(DiagnosticInfoGeneric(W, DS_Error));
+
+ return Valid;
+}
+
+static void indirectGlobals(GlobalVariable *IndirectionTable,
+ SmallVector<GlobalVariable *> ToIndirect) {
+ // We replace globals with an indirected access via a pointer that will get
+ // set by the HIPSTDPAR runtime, using their accessible, program-wide unique
+ // address as set by the host linker-loader.
+ SmallVector<Constant *> SymbolIndirections;
+ for (auto &&G : ToIndirect) {
+ SmallVector<std::reference_wrapper<Use>> Uses = collectIndirectableUses(G);
+
+ if (Uses.empty())
+ continue;
+
+ Constant *IndirectedGlobal =
+ appendIndirectedGlobal(IndirectionTable, SymbolIndirections, G);
+
+ for_each(Uses,
+ [=](auto &&U) { replaceWithIndirectUse(U, G, IndirectedGlobal); });
+
+ eraseFromModule(*G);
+ }
+
+ if (SymbolIndirections.empty())
+ return;
+
+ fillIndirectionTable(IndirectionTable, std::move(SymbolIndirections));
+}
+
static inline void maybeHandleGlobals(Module &M) {
unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
- for (auto &&G : M.globals()) { // TODO: should we handle these in the FE?
+
+ SmallVector<GlobalVariable *> ToIndirect;
+ for (auto &&G : M.globals()) {
if (!checkIfSupported(G))
return clearModule(M);
-
- if (G.isThreadLocal())
- continue;
- if (G.isConstant())
- continue;
if (G.getAddressSpace() != GlobAS)
continue;
- if (G.getLinkage() != GlobalVariable::ExternalLinkage)
+ if (G.isConstant() && G.hasInitializer() && G.hasAtLeastLocalUnnamedAddr())
continue;
- G.setLinkage(GlobalVariable::ExternalWeakLinkage);
- G.setInitializer(nullptr);
- G.setExternallyInitialized(true);
+ ToIndirect.push_back(&G);
+ }
+
+ if (ToIndirect.empty())
+ return;
+
+ if (auto *IT = M.getNamedGlobal("__hipstdpar_symbol_indirection_table")) {
+ if (!isValidIndirectionTable(IT))
+ return clearModule(M);
+ return indirectGlobals(IT, std::move(ToIndirect));
+ } else {
+ for (auto &&G : ToIndirect) {
+ // We will internalise these, so we provide a poison initialiser.
+ if (!G->hasInitializer())
+ G->setInitializer(PoisonValue::get(G->getValueType()));
+ }
}
}
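A sketch of the table shape isValidIndirectionTable accepts, written as the runtime might declare it (field and type names are hypothetical; only the {integer, pointer, struct} layout is checked above):

    #include <cstddef>

    struct SymbolEntry {
      const char *Name;   // anonymous global holding the variable's name
      void **Indirection; // externally initialised with the symbol's address
    };

    struct IndirectionTable {
      std::size_t Count;    // element 0: integer symbol count
      SymbolEntry *Symbols; // element 1: pointer to the entry array
      SymbolEntry Pad;      // element 2: carries the entry struct type
    };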
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 2623be3..bdda498 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2529,7 +2529,7 @@ static bool OptimizeNonTrivialIFuncs(
bool Changed = false;
// Cache containing the mask constructed from a function's target features.
- DenseMap<Function *, uint64_t> FeatureMask;
+ DenseMap<Function *, APInt> FeatureMask;
for (GlobalIFunc &IF : M.ifuncs()) {
if (IF.isInterposable())
@@ -2568,7 +2568,7 @@ static bool OptimizeNonTrivialIFuncs(
// Sort the callee versions in decreasing priority order.
sort(Callees, [&](auto *LHS, auto *RHS) {
- return FeatureMask[LHS] > FeatureMask[RHS];
+ return FeatureMask[LHS].ugt(FeatureMask[RHS]);
});
// Find the callsites and cache the feature mask for each caller.
@@ -2591,10 +2591,10 @@ static bool OptimizeNonTrivialIFuncs(
// Sort the caller versions in decreasing priority order.
sort(Callers, [&](auto *LHS, auto *RHS) {
- return FeatureMask[LHS] > FeatureMask[RHS];
+ return FeatureMask[LHS].ugt(FeatureMask[RHS]);
});
- auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; };
+ auto implies = [](APInt A, APInt B) { return B.isSubsetOf(A); };
// Index to the highest priority candidate.
unsigned I = 0;
@@ -2603,8 +2603,8 @@ static bool OptimizeNonTrivialIFuncs(
assert(I < Callees.size() && "Found callers of equal priority");
Function *Callee = Callees[I];
- uint64_t CallerBits = FeatureMask[Caller];
- uint64_t CalleeBits = FeatureMask[Callee];
+ APInt CallerBits = FeatureMask[Caller];
+ APInt CalleeBits = FeatureMask[Callee];
// In the case of FMV callers, we know that all higher priority callers
// than the current one did not get selected at runtime, which helps
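A small model of the widened implication test above (assuming only APInt semantics; the masks are illustrative):

    #include "llvm/ADT/APInt.h"

    // Does caller mask A provide every feature in callee mask B? With APInt
    // this is B.isSubsetOf(A), which generalises (A & B) == B beyond 64 bits.
    bool implies(const llvm::APInt &A, const llvm::APInt &B) {
      return B.isSubsetOf(A);
    }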
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 469f435..b803c97 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3998,6 +3998,24 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
}
+// Update the debug information attached to NewFunc to use the clone Name. Note
+// this needs to be done both for any existing DISubprogram for the definition
+// and for any separate declaration DISubprogram.
+static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
+ assert(Name == NewFunc->getName());
+ auto *SP = NewFunc->getSubprogram();
+ if (!SP)
+ return;
+ auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
+ SP->replaceLinkageName(MDName);
+ DISubprogram *Decl = SP->getDeclaration();
+ if (!Decl)
+ return;
+ TempDISubprogram NewDecl = Decl->clone();
+ NewDecl->replaceLinkageName(MDName);
+ SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
+}
+
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
@@ -4009,9 +4027,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite(
std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
assert(!Func.func()->getParent()->getFunction(Name));
NewFunc->setName(Name);
- if (auto *SP = NewFunc->getSubprogram())
- SP->replaceLinkageName(
- MDString::get(NewFunc->getParent()->getContext(), Name));
+ updateSubprogramLinkageName(NewFunc, Name);
for (auto &Inst : CallsWithMetadataInFunc) {
// This map always has the initial version in it.
assert(Inst.cloneNo() == 0);
@@ -4950,9 +4966,7 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
PrevF->eraseFromParent();
} else
NewF->setName(Name);
- if (auto *SP = NewF->getSubprogram())
- SP->replaceLinkageName(
- MDString::get(NewF->getParent()->getContext(), Name));
+ updateSubprogramLinkageName(NewF, Name);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
<< "created clone " << ore::NV("NewFunction", NewF));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 981c527..d934638 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1355,9 +1355,9 @@ Instruction *InstCombinerImpl::
// right-shift of X and a "select".
Value *X, *Select;
Instruction *LowBitsToSkip, *Extract;
- if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd(
- m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)),
- m_Instruction(Extract))),
+ if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_Instruction(
+ Extract, m_LShr(m_Value(X),
+ m_Instruction(LowBitsToSkip)))),
m_Value(Select))))
return nullptr;
@@ -1763,13 +1763,12 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
Constant *C;
// (add X, (sext/zext (icmp eq X, C)))
// -> (select (icmp eq X, C), (add C, (sext/zext 1)), X)
- auto CondMatcher = m_CombineAnd(
- m_Value(Cond),
- m_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A), m_ImmConstant(C)));
+ auto CondMatcher =
+ m_Value(Cond, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A),
+ m_ImmConstant(C)));
if (match(&I,
- m_c_Add(m_Value(A),
- m_CombineAnd(m_Value(Ext), m_ZExtOrSExt(CondMatcher)))) &&
+ m_c_Add(m_Value(A), m_Value(Ext, m_ZExtOrSExt(CondMatcher)))) &&
Ext->hasOneUse()) {
Value *Add = isa<ZExtInst>(Ext) ? InstCombiner::AddOne(C)
: InstCombiner::SubOne(C);
@@ -2146,13 +2145,33 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
return Base;
}
+bool CommonPointerBase::isExpensive() const {
+ unsigned NumGEPs = 0;
+ auto ProcessGEPs = [&NumGEPs](ArrayRef<GEPOperator *> GEPs) {
+ bool SeenMultiUse = false;
+ for (GEPOperator *GEP : GEPs) {
+ // Only count multi-use GEPs, excluding the first one. For the first one,
+ // we will directly reuse the offset. For one-use GEPs, their offset will
+ // be folded into a multi-use GEP.
+ if (!GEP->hasOneUse()) {
+ if (SeenMultiUse)
+ ++NumGEPs;
+ SeenMultiUse = true;
+ }
+ }
+ };
+ ProcessGEPs(LHSGEPs);
+ ProcessGEPs(RHSGEPs);
+ return NumGEPs > 2;
+}
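An illustrative source-level case for OptimizePointerDifference (a sketch; the isExpensive() guard above instead bails out when reaching the common base would re-expand more than two multi-use GEP offset chains):

    long diff(int *A) {
      return (A + 10) - A;   // common base A: folds to the constant 10
    }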
+
/// Optimize pointer differences into the same array into a size. Consider:
/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
Type *Ty, bool IsNUW) {
CommonPointerBase Base = CommonPointerBase::compute(LHS, RHS);
- if (!Base.Ptr)
+ if (!Base.Ptr || Base.isExpensive())
return nullptr;
// To avoid duplicating the offset arithmetic, rewrite the GEP to use the
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 3beda6b..b231c04 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2025,10 +2025,9 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
if (CountUses && !Op->hasOneUse())
return false;
- if (match(Op, m_c_BinOp(FlippedOpcode,
- m_CombineAnd(m_Value(X),
- m_Not(m_c_BinOp(Opcode, m_A, m_B))),
- m_C)))
+ if (match(Op,
+ m_c_BinOp(FlippedOpcode,
+ m_Value(X, m_Not(m_c_BinOp(Opcode, m_A, m_B))), m_C)))
return !CountUses || X->hasOneUse();
return false;
@@ -2079,10 +2078,10 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
// result is more undefined than a source:
// (~(A & B) | C) & ~(C & (A ^ B)) --> (A ^ B ^ C) | ~(A | C) is invalid.
if (Opcode == Instruction::Or && Op0->hasOneUse() &&
- match(Op1, m_OneUse(m_Not(m_CombineAnd(
- m_Value(Y),
- m_c_BinOp(Opcode, m_Specific(C),
- m_c_Xor(m_Specific(A), m_Specific(B)))))))) {
+ match(Op1,
+ m_OneUse(m_Not(m_Value(
+ Y, m_c_BinOp(Opcode, m_Specific(C),
+ m_c_Xor(m_Specific(A), m_Specific(B)))))))) {
// X = ~(A | B)
    // Y = C | (A ^ B)
Value *Or = cast<BinaryOperator>(X)->getOperand(0);
@@ -2098,12 +2097,11 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
if (match(Op0,
m_OneUse(m_c_BinOp(FlippedOpcode,
m_BinOp(FlippedOpcode, m_Value(B), m_Value(C)),
- m_CombineAnd(m_Value(X), m_Not(m_Value(A)))))) ||
- match(Op0, m_OneUse(m_c_BinOp(
- FlippedOpcode,
- m_c_BinOp(FlippedOpcode, m_Value(C),
- m_CombineAnd(m_Value(X), m_Not(m_Value(A)))),
- m_Value(B))))) {
+ m_Value(X, m_Not(m_Value(A)))))) ||
+ match(Op0, m_OneUse(m_c_BinOp(FlippedOpcode,
+ m_c_BinOp(FlippedOpcode, m_Value(C),
+ m_Value(X, m_Not(m_Value(A)))),
+ m_Value(B))))) {
// X = ~A
// (~A & B & C) | ~(A | B | C) --> ~(A | (B ^ C))
// (~A | B | C) & ~(A & B & C) --> (~A | (B ^ C))
@@ -2434,8 +2432,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
// (-(X & 1)) & Y --> (X & 1) == 0 ? 0 : Y
Value *Neg;
if (match(&I,
- m_c_And(m_CombineAnd(m_Value(Neg),
- m_OneUse(m_Neg(m_And(m_Value(), m_One())))),
+ m_c_And(m_Value(Neg, m_OneUse(m_Neg(m_And(m_Value(), m_One())))),
m_Value(Y)))) {
Value *Cmp = Builder.CreateIsNull(Neg);
return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Y);
@@ -3728,9 +3725,8 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I,
const APInt *C1, *C2;
if (match(&I,
m_c_Or(m_ExtractValue<1>(
- m_CombineAnd(m_Intrinsic<Intrinsic::umul_with_overflow>(
- m_Value(X), m_APInt(C1)),
- m_Value(WOV))),
+ m_Value(WOV, m_Intrinsic<Intrinsic::umul_with_overflow>(
+ m_Value(X), m_APInt(C1)))),
m_OneUse(m_SpecificCmp(ICmpInst::ICMP_UGT,
m_ExtractValue<0>(m_Deferred(WOV)),
m_APInt(C2))))) &&
@@ -3988,12 +3984,12 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
// ~(B & ?) | (A ^ B) --> ~((B & ?) & A)
Instruction *And;
if ((Op0->hasOneUse() || Op1->hasOneUse()) &&
- match(Op0, m_Not(m_CombineAnd(m_Instruction(And),
- m_c_And(m_Specific(A), m_Value())))))
+ match(Op0,
+ m_Not(m_Instruction(And, m_c_And(m_Specific(A), m_Value())))))
return BinaryOperator::CreateNot(Builder.CreateAnd(And, B));
if ((Op0->hasOneUse() || Op1->hasOneUse()) &&
- match(Op0, m_Not(m_CombineAnd(m_Instruction(And),
- m_c_And(m_Specific(B), m_Value())))))
+ match(Op0,
+ m_Not(m_Instruction(And, m_c_And(m_Specific(B), m_Value())))))
return BinaryOperator::CreateNot(Builder.CreateAnd(And, A));
// (~A | C) | (A ^ B) --> ~(A & B) | C
@@ -4125,16 +4121,13 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
// treating any non-zero result as overflow. In that case, we overflow if both
// umul.with.overflow operands are != 0, as in that case the result can only
// be 0, iff the multiplication overflows.
- if (match(&I,
- m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
- m_Value(Ov)),
- m_CombineAnd(
- m_SpecificICmp(ICmpInst::ICMP_NE,
- m_CombineAnd(m_ExtractValue<0>(
- m_Deferred(UMulWithOv)),
- m_Value(Mul)),
- m_ZeroInt()),
- m_Value(MulIsNotZero)))) &&
+ if (match(&I, m_c_Or(m_Value(Ov, m_ExtractValue<1>(m_Value(UMulWithOv))),
+ m_Value(MulIsNotZero,
+ m_SpecificICmp(
+ ICmpInst::ICMP_NE,
+ m_Value(Mul, m_ExtractValue<0>(
+ m_Deferred(UMulWithOv))),
+ m_ZeroInt())))) &&
(Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse()))) {
Value *A, *B;
if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
@@ -4151,9 +4144,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
const WithOverflowInst *WO;
const Value *WOV;
const APInt *C1, *C2;
- if (match(&I, m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_CombineAnd(
- m_WithOverflowInst(WO), m_Value(WOV))),
- m_Value(Ov)),
+ if (match(&I, m_c_Or(m_Value(Ov, m_ExtractValue<1>(
+ m_Value(WOV, m_WithOverflowInst(WO)))),
m_OneUse(m_ICmp(Pred, m_ExtractValue<0>(m_Deferred(WOV)),
m_APInt(C2))))) &&
(WO->getBinaryOp() == Instruction::Add ||
@@ -4501,8 +4493,7 @@ static Instruction *visitMaskedMerge(BinaryOperator &I,
Value *M;
if (!match(&I, m_c_Xor(m_Value(B),
m_OneUse(m_c_And(
- m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
- m_Value(D)),
+ m_Value(D, m_c_Xor(m_Deferred(B), m_Value(X))),
m_Value(M))))))
return nullptr;
@@ -5206,8 +5197,7 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
// (X ^ C) ^ Y --> (X ^ Y) ^ C
// Just like we do in other places, we completely avoid the fold
// for constantexprs, at least to avoid endless combine loop.
- if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_CombineAnd(m_Value(X),
- m_Unless(m_ConstantExpr())),
+ if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(X, m_Unless(m_ConstantExpr())),
m_ImmConstant(C1))),
m_Value(Y))))
return BinaryOperator::CreateXor(Builder.CreateXor(X, Y), C1);
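The mechanical rewrites in this file rely on the two-argument matcher overloads used above, where m_Value(V, P) and m_Instruction(I, P) bind the node and apply a sub-pattern in one step. A minimal equivalence sketch:

    // Old spelling: bind X, then constrain it with a second combinator:
    //   m_CombineAnd(m_Value(X), m_Not(m_Value()))
    // New spelling, same semantics, via the two-argument overload:
    //   m_Value(X, m_Not(m_Value()))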
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d88bc2c..1b78ace 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1830,10 +1830,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
bool IntMinIsPoison = cast<Constant>(II->getArgOperand(1))->isOneValue();
// abs(-x) -> abs(x)
- // TODO: Copy nsw if it was present on the neg?
Value *X;
- if (match(IIOperand, m_Neg(m_Value(X))))
+ if (match(IIOperand, m_Neg(m_Value(X)))) {
+ if (cast<Instruction>(IIOperand)->hasNoSignedWrap() || IntMinIsPoison)
+ replaceOperand(*II, 1, Builder.getTrue());
return replaceOperand(*II, 0, X);
+ }
if (match(IIOperand, m_c_Select(m_Neg(m_Value(X)), m_Deferred(X))))
return replaceOperand(*II, 0, X);
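A scalar model of why the poison flag may be strengthened here (reasoning sketch, not the in-tree code): if the negation carried nsw, then x != INT_MIN on every non-poison path, so marking int_min_is_poison on the rewritten abs(x) rules out only an input the nsw flag already excluded.

    // abs(sub nsw 0, x) == abs(x), and nsw promises x != INT_MIN.
    int model_abs(int x) { return x < 0 ? -x : x; } // safe given x != INT_MIN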
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c90ff2a..da9b126 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -712,7 +712,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
};
CommonPointerBase Base = CommonPointerBase::compute(GEPLHS, RHS);
- if (Base.Ptr == RHS && CanFold(Base.LHSNW)) {
+ if (Base.Ptr == RHS && CanFold(Base.LHSNW) && !Base.isExpensive()) {
// ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
Type *IdxTy = DL.getIndexType(GEPLHS->getType());
Value *Offset =
@@ -755,8 +755,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// If the base pointers are different, but the indices are the same, just
// compare the base pointer.
- Value *PtrBase = GEPLHS->getOperand(0);
- if (PtrBase != GEPRHS->getOperand(0)) {
+ if (GEPLHS->getOperand(0) != GEPRHS->getOperand(0)) {
bool IndicesTheSame =
GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
GEPLHS->getPointerOperand()->getType() ==
@@ -782,7 +781,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
(GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
(GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
- PtrBase->stripPointerCasts() ==
+ GEPLHS->getOperand(0)->stripPointerCasts() ==
GEPRHS->getOperand(0)->stripPointerCasts() &&
!GEPLHS->getType()->isVectorTy()) {
Value *LOffset = EmitGEPOffset(GEPLHS);
@@ -805,14 +804,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
LOffset, ROffset);
return replaceInstUsesWith(I, Cmp);
}
-
- // Otherwise, the base pointers are different and the indices are
- // different. Try convert this to an indexed compare by looking through
- // PHIs/casts.
- return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this);
}
- if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
+ if (GEPLHS->getOperand(0) == GEPRHS->getOperand(0) &&
+ GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) {
// If the GEPs only differ by one index, compare it.
unsigned NumDifferences = 0; // Keep track of # differences.
@@ -849,11 +844,14 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}
}
- if (CanFold(NW)) {
+ if (Base.Ptr && CanFold(Base.LHSNW & Base.RHSNW) && !Base.isExpensive()) {
// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
- Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true);
- Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true);
- return NewICmp(NW, L, R);
+ Type *IdxTy = DL.getIndexType(GEPLHS->getType());
+ Value *L =
+ EmitGEPOffsets(Base.LHSGEPs, Base.LHSNW, IdxTy, /*RewriteGEP=*/true);
+ Value *R =
+ EmitGEPOffsets(Base.RHSGEPs, Base.RHSNW, IdxTy, /*RewriteGEP=*/true);
+ return NewICmp(Base.LHSNW & Base.RHSNW, L, R);
}
}
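An illustrative source-level view of the common-base compare fold (a sketch; validity depends on the no-wrap flags checked by CanFold):

    bool cmp(int *p, long i, long j) {
      return p + i < p + j;   // same base p: folds to 'i < j'
    }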
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index f7fbf08..c67e27e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -910,6 +910,9 @@ struct CommonPointerBase {
GEPNoWrapFlags RHSNW = GEPNoWrapFlags::all();
static CommonPointerBase compute(Value *LHS, Value *RHS);
+
+ /// Whether expanding the GEP chains is expensive.
+ bool isExpensive() const;
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 2cc1bc9..0be1034 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,7 +12,6 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -1503,8 +1502,7 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
// This is a non-terminator unreachable marker. Don't remove it.
if (isa<UndefValue>(Ptr)) {
// Remove guaranteed-to-transfer instructions before the marker.
- if (removeInstructionsBeforeUnreachable(SI))
- return &SI;
+ removeInstructionsBeforeUnreachable(SI);
// Remove all instructions after the marker and handle dead blocks this
// implies.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 503611a..e2a9255 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -219,18 +219,64 @@ Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {
Value *InstCombinerImpl::EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs,
GEPNoWrapFlags NW, Type *IdxTy,
bool RewriteGEPs) {
- Value *Sum = nullptr;
- for (GEPOperator *GEP : reverse(GEPs)) {
- Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
- if (Offset->getType() != IdxTy)
- Offset = Builder.CreateVectorSplat(
- cast<VectorType>(IdxTy)->getElementCount(), Offset);
+ auto Add = [&](Value *Sum, Value *Offset) -> Value * {
if (Sum)
- Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
- NW.isInBounds());
+ return Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
+ NW.isInBounds());
else
- Sum = Offset;
+ return Offset;
+ };
+
+ Value *Sum = nullptr;
+ Value *OneUseSum = nullptr;
+ Value *OneUseBase = nullptr;
+ GEPNoWrapFlags OneUseFlags = GEPNoWrapFlags::all();
+ for (GEPOperator *GEP : reverse(GEPs)) {
+ Value *Offset;
+ {
+ // Expand the offset at the point of the previous GEP to enable rewriting.
+ // However, use the original insertion point for calculating Sum.
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ auto *Inst = dyn_cast<Instruction>(GEP);
+ if (RewriteGEPs && Inst)
+ Builder.SetInsertPoint(Inst);
+
+ Offset = llvm::emitGEPOffset(&Builder, DL, GEP);
+ if (Offset->getType() != IdxTy)
+ Offset = Builder.CreateVectorSplat(
+ cast<VectorType>(IdxTy)->getElementCount(), Offset);
+ if (GEP->hasOneUse()) {
+ // Offsets of one-use GEPs will be merged into the next multi-use GEP.
+ OneUseSum = Add(OneUseSum, Offset);
+ OneUseFlags = OneUseFlags.intersectForOffsetAdd(GEP->getNoWrapFlags());
+ if (!OneUseBase)
+ OneUseBase = GEP->getPointerOperand();
+ continue;
+ }
+
+ if (OneUseSum)
+ Offset = Add(OneUseSum, Offset);
+
+ // Rewrite the GEP to reuse the computed offset. This also includes
+ // offsets from preceding one-use GEPs.
+ if (RewriteGEPs && Inst &&
+ !(GEP->getSourceElementType()->isIntegerTy(8) &&
+ GEP->getOperand(1) == Offset)) {
+ replaceInstUsesWith(
+ *Inst,
+ Builder.CreatePtrAdd(
+ OneUseBase ? OneUseBase : GEP->getPointerOperand(), Offset, "",
+ OneUseFlags.intersectForOffsetAdd(GEP->getNoWrapFlags())));
+ eraseInstFromFunction(*Inst);
+ }
+ }
+
+ Sum = Add(Sum, Offset);
+ OneUseSum = OneUseBase = nullptr;
+ OneUseFlags = GEPNoWrapFlags::all();
}
+ if (OneUseSum)
+ Sum = Add(Sum, OneUseSum);
if (!Sum)
return Constant::getNullValue(IdxTy);
return Sum;
@@ -1417,10 +1463,8 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) {
}
// Update pre-existing debug value uses.
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
- assert(DbgValues.empty());
+ llvm::findDbgValues(I, DbgVariableRecords);
for (DbgVariableRecord *DbgVal : DbgVariableRecords) {
SmallVector<uint64_t, 1> Ops = {dwarf::DW_OP_not};
@@ -3565,12 +3609,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
// before each store.
- SmallVector<DbgVariableIntrinsic *, 8> DVIs;
SmallVector<DbgVariableRecord *, 8> DVRs;
std::unique_ptr<DIBuilder> DIB;
if (isa<AllocaInst>(MI)) {
- findDbgUsers(DVIs, &MI, &DVRs);
- assert(DVIs.empty());
+ findDbgUsers(&MI, DVRs);
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
}
@@ -3692,9 +3734,6 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
//
// FIXME: the Assignment Tracking project has now likely made this
// redundant (and it's sometimes harmful).
- for (auto *DVI : DVIs)
- if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref())
- DVI->eraseFromParent();
for (auto *DVR : DVRs)
if (DVR->isAddressOfVariable() || DVR->getExpression()->startsWithDeref())
DVR->eraseFromParent();
@@ -5246,10 +5285,8 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I,
// maximise the range variables have location for. If we cannot salvage, then
// mark the location undef: we know it was supposed to receive a new location
// here, but that computation has been sunk.
- SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
- findDbgUsers(DbgUsers, I, &DbgVariableRecords);
- assert(DbgUsers.empty());
+ findDbgUsers(I, DbgVariableRecords);
if (!DbgVariableRecords.empty())
tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock,
DbgVariableRecords);
@@ -5376,7 +5413,7 @@ void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords(
if (DVRClones.empty())
return;
- salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage);
+ salvageDebugInfoForDbgValues(*I, DbgVariableRecordsToSalvage);
// The clones are in reverse order of original appearance. Assert that the
// head bit is set on the iterator as we _should_ have received it via
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 5957940..e87bee7 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1063,7 +1063,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
};
SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
- bool HasUntracedLifetimeIntrinsic = false;
SmallVector<AllocaInst *, 1> DynamicAllocaVec;
SmallVector<IntrinsicInst *, 1> StackRestoreVec;
@@ -1097,14 +1096,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
initializeCallbacks(*F.getParent());
- if (HasUntracedLifetimeIntrinsic) {
- // If there are lifetime intrinsics which couldn't be traced back to an
- // alloca, we may not know exactly when a variable enters scope, and
- // therefore should "fail safe" by not poisoning them.
- StaticAllocaPoisonCallVec.clear();
- DynamicAllocaPoisonCallVec.clear();
- }
-
processDynamicAllocas();
processStaticAllocas();
@@ -1231,13 +1222,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
!ConstantInt::isValueValidForType(IntptrTy, SizeValue))
return;
// Find alloca instruction that corresponds to llvm.lifetime argument.
- // Currently we can only handle lifetime markers pointing to the
- // beginning of the alloca.
- AllocaInst *AI = findAllocaForValue(II.getArgOperand(1), true);
- if (!AI) {
- HasUntracedLifetimeIntrinsic = true;
- return;
- }
+ AllocaInst *AI = cast<AllocaInst>(II.getArgOperand(1));
// We're interested only in allocas we can handle.
if (!ASan.isInterestingAlloca(*AI))
return;
@@ -3637,6 +3622,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
"Variable descriptions relative to ASan stack base will be dropped");
// Replace Alloca instructions with base+offset.
+ SmallVector<Value *> NewAllocaPtrs;
for (const auto &Desc : SVD) {
AllocaInst *AI = Desc.AI;
replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
@@ -3645,6 +3631,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
AI->getType());
AI->replaceAllUsesWith(NewAllocaPtr);
+ NewAllocaPtrs.push_back(NewAllocaPtr);
}
// The left-most redzone has enough space for at least 4 pointers.
@@ -3694,6 +3681,15 @@ void FunctionStackPoisoner::processStaticAllocas() {
}
}
+ // Remove lifetime markers now that these are no longer allocas.
+ for (Value *NewAllocaPtr : NewAllocaPtrs) {
+ for (User *U : make_early_inc_range(NewAllocaPtr->users())) {
+ auto *I = cast<Instruction>(U);
+ if (I->isLifetimeStartOrEnd())
+ I->eraseFromParent();
+ }
+ }
+
SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
SmallVector<uint8_t, 64> ShadowAfterReturn;
@@ -3829,6 +3825,13 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+ // Remove lifetime markers now that this is no longer an alloca.
+ for (User *U : make_early_inc_range(AI->users())) {
+ auto *I = cast<Instruction>(U);
+ if (I->isLifetimeStartOrEnd())
+ I->eraseFromParent();
+ }
+
  // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
AI->replaceAllUsesWith(NewAddressPtr);
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 2c34bf2..fe3315e 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -363,10 +363,10 @@ private:
void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
- bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag,
+ void instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag,
const DominatorTree &DT, const PostDominatorTree &PDT,
const LoopInfo &LI);
- bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
+ void instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
Value *getNextTagWithCall(IRBuilder<> &IRB);
Value *getStackBaseTag(IRBuilder<> &IRB);
Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, unsigned AllocaNo);
@@ -1418,7 +1418,7 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
}
}
-bool HWAddressSanitizer::instrumentLandingPads(
+void HWAddressSanitizer::instrumentLandingPads(
SmallVectorImpl<Instruction *> &LandingPadVec) {
for (auto *LP : LandingPadVec) {
IRBuilder<> IRB(LP->getNextNode());
@@ -1427,10 +1427,9 @@ bool HWAddressSanitizer::instrumentLandingPads(
{memtag::readRegister(
IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp")});
}
- return true;
}
-bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
+void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
Value *StackTag, Value *UARTag,
const DominatorTree &DT,
const PostDominatorTree &PDT,
@@ -1500,7 +1499,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
// statement if return_twice functions are called.
bool StandardLifetime =
!SInfo.CallsReturnTwice &&
- SInfo.UnrecognizedLifetimes.empty() &&
memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, &DT,
&LI, ClMaxLifetimes);
if (DetectUseAfterScope && StandardLifetime) {
@@ -1525,9 +1523,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
}
memtag::alignAndPadAlloca(Info, Mapping.getObjectAlignment());
}
- for (auto &I : SInfo.UnrecognizedLifetimes)
- I->eraseFromParent();
- return true;
}
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE,
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 854db0f..f451c2b 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -80,6 +80,27 @@ static cl::opt<unsigned>
ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden,
cl::desc("Skip Callsite up to this number for this compilation"));
+// ICP the candidate function even when only a declaration is present.
+static cl::opt<bool> ICPAllowDecls(
+ "icp-allow-decls", cl::init(false), cl::Hidden,
+    cl::desc("Promote the target candidate even when the definition "
+             "is not available"));
+
+// ICP hot candidate functions only. When set to false, non-cold functions
+// (warm functions) can also be promoted.
+static cl::opt<bool>
+ ICPAllowHotOnly("icp-allow-hot-only", cl::init(true), cl::Hidden,
+ cl::desc("Promote the target candidate only if it is a "
+ "hot function. Otherwise, warm functions can "
+ "also be promoted"));
+
+// If one target cannot be ICP'd, proceed with the remaining targets instead
+// of exiting the callsite.
+static cl::opt<bool> ICPAllowCandidateSkip(
+ "icp-allow-candidate-skip", cl::init(false), cl::Hidden,
+    cl::desc("Continue with the remaining targets instead of exiting "
+             "when a candidate fails"));
+
// Set if the pass is called in LTO optimization. The difference for LTO mode
// is the pass won't prefix the source module name to the internal linkage
// symbols.
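A hedged invocation sketch for the new knobs (assuming the standard opt driver and the pgo-icall-prom pass name, neither of which is shown in this diff):

    opt -passes=pgo-icall-prom -icp-allow-decls \
        -icp-allow-hot-only=false -icp-allow-candidate-skip \
        -o out.bc in.bc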
@@ -330,6 +351,7 @@ private:
struct PromotionCandidate {
Function *const TargetFunction;
const uint64_t Count;
+ const uint32_t Index;
  // The following fields only exist for promotion candidates with vtable
// information.
@@ -341,7 +363,8 @@ private:
VTableGUIDCountsMap VTableGUIDAndCounts;
SmallVector<Constant *> AddressPoints;
- PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+ PromotionCandidate(Function *F, uint64_t C, uint32_t I)
+ : TargetFunction(F), Count(C), Index(I) {}
};
// Check if the indirect-call call site should be promoted. Return the number
@@ -356,12 +379,10 @@ private:
// Promote a list of targets for one indirect-call callsite by comparing
// indirect callee with functions. Return true if there are IR
// transformations and false otherwise.
- bool tryToPromoteWithFuncCmp(CallBase &CB, Instruction *VPtr,
- ArrayRef<PromotionCandidate> Candidates,
- uint64_t TotalCount,
- ArrayRef<InstrProfValueData> ICallProfDataRef,
- uint32_t NumCandidates,
- VTableGUIDCountsMap &VTableGUIDCounts);
+ bool tryToPromoteWithFuncCmp(
+ CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates,
+ uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef,
+ uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts);
// Promote a list of targets for one indirect call by comparing vtables with
// functions. Return true if there are IR transformations and false
@@ -394,12 +415,15 @@ private:
Constant *getOrCreateVTableAddressPointVar(GlobalVariable *GV,
uint64_t AddressPointOffset);
- void updateFuncValueProfiles(CallBase &CB, ArrayRef<InstrProfValueData> VDs,
+ void updateFuncValueProfiles(CallBase &CB,
+ MutableArrayRef<InstrProfValueData> VDs,
uint64_t Sum, uint32_t MaxMDCount);
void updateVPtrValueProfiles(Instruction *VPtr,
VTableGUIDCountsMap &VTableGUIDCounts);
+ bool isValidTarget(uint64_t, Function *, const CallBase &, uint64_t);
+
public:
IndirectCallPromoter(
Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO,
@@ -419,6 +443,53 @@ public:
} // end anonymous namespace
+bool IndirectCallPromoter::isValidTarget(uint64_t Target,
+ Function *TargetFunction,
+ const CallBase &CB, uint64_t Count) {
+ // Don't promote if the symbol is not defined in the module. This avoids
+ // creating a reference to a symbol that doesn't exist in the module
+ // This can happen when we compile with a sample profile collected from
+ // one binary but used for another, which may have profiled targets that
+ // aren't used in the new binary. We might have a declaration initially in
+ // the case where the symbol is globally dead in the binary and removed by
+ // ThinLTO.
+ using namespace ore;
+ if (TargetFunction == nullptr) {
+ LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
+ << "Cannot promote indirect call: target with md5sum "
+ << NV("target md5sum", Target)
+ << " not found (count=" << NV("Count", Count) << ")";
+ });
+ return false;
+ }
+ if (!ICPAllowDecls && TargetFunction->isDeclaration()) {
+ LLVM_DEBUG(dbgs() << " Not promote: target definition is not available\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoTargetDef", &CB)
+ << "Do not promote indirect call: target with md5sum "
+ << NV("target md5sum", Target)
+ << " definition not available (count=" << ore::NV("Count", Count)
+ << ")";
+ });
+ return false;
+ }
+
+ const char *Reason = nullptr;
+ if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
+
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
+ << "Cannot promote indirect call to "
+ << NV("TargetFunction", TargetFunction)
+ << " (count=" << NV("Count", Count) << "): " << Reason;
+ });
+ return false;
+ }
+ return true;
+}
+
// Indirect-call promotion heuristic. The direct targets are sorted based on
// the count. Stop at the first target that is not promoted.
std::vector<IndirectCallPromoter::PromotionCandidate>
@@ -469,38 +540,15 @@ IndirectCallPromoter::getPromotionCandidatesForCallSite(
break;
}
- // Don't promote if the symbol is not defined in the module. This avoids
- // creating a reference to a symbol that doesn't exist in the module
- // This can happen when we compile with a sample profile collected from
- // one binary but used for another, which may have profiled targets that
- // aren't used in the new binary. We might have a declaration initially in
- // the case where the symbol is globally dead in the binary and removed by
- // ThinLTO.
Function *TargetFunction = Symtab->getFunction(Target);
- if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
- << "Cannot promote indirect call: target with md5sum "
- << ore::NV("target md5sum", Target) << " not found";
- });
- break;
- }
-
- const char *Reason = nullptr;
- if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
- using namespace ore;
-
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
- << "Cannot promote indirect call to "
- << NV("TargetFunction", TargetFunction) << " with count of "
- << NV("Count", Count) << ": " << Reason;
- });
- break;
+ if (!isValidTarget(Target, TargetFunction, CB, Count)) {
+ if (ICPAllowCandidateSkip)
+ continue;
+ else
+ break;
}
- Ret.push_back(PromotionCandidate(TargetFunction, Count));
+ Ret.push_back(PromotionCandidate(TargetFunction, Count, I));
TotalCount -= Count;
}
return Ret;
@@ -642,7 +690,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
// Promote indirect-call to conditional direct-call for one callsite.
bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates,
- uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
+ uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef,
uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts) {
uint32_t NumPromoted = 0;
@@ -655,6 +703,8 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
NumOfPGOICallPromotion++;
NumPromoted++;
+    // Zero out the count; this entry will be dropped when the value profiles
+    // are updated.
+ ICallProfDataRef[C.Index].Count = 0;
if (!EnableVTableProfileUse || C.VTableGUIDAndCounts.empty())
continue;
@@ -679,21 +729,33 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
"Number of promoted functions should not be greater than the number "
"of values in profile metadata");
- // Update value profiles on the indirect call.
- updateFuncValueProfiles(CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
- NumCandidates);
+ updateFuncValueProfiles(CB, ICallProfDataRef, TotalCount, NumCandidates);
updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
return true;
}
void IndirectCallPromoter::updateFuncValueProfiles(
- CallBase &CB, ArrayRef<InstrProfValueData> CallVDs, uint64_t TotalCount,
- uint32_t MaxMDCount) {
+ CallBase &CB, MutableArrayRef<InstrProfValueData> CallVDs,
+ uint64_t TotalCount, uint32_t MaxMDCount) {
// First clear the existing !prof.
CB.setMetadata(LLVMContext::MD_prof, nullptr);
+
+ // Sort value profiles by count in descending order.
+ llvm::stable_sort(CallVDs, [](const InstrProfValueData &LHS,
+ const InstrProfValueData &RHS) {
+ return LHS.Count > RHS.Count;
+ });
+ // Drop the <target-value, count> pair if count is zero.
+ ArrayRef<InstrProfValueData> VDs(
+ CallVDs.begin(),
+ llvm::upper_bound(CallVDs, 0U,
+ [](uint64_t Count, const InstrProfValueData &ProfData) {
+ return ProfData.Count <= Count;
+ }));
+
// Annotate the remaining value profiles if counter is not zero.
if (TotalCount != 0)
- annotateValueSite(M, CB, CallVDs, TotalCount, IPVK_IndirectCallTarget,
+ annotateValueSite(M, CB, VDs, TotalCount, IPVK_IndirectCallTarget,
MaxMDCount);
}
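
The sort-then-trim idiom above works because the descending sort makes every zero-count entry a suffix, so a binary search can bound the live prefix. A standalone sketch with std algorithms, using a hypothetical VD struct in place of InstrProfValueData:

#include <algorithm>
#include <cstdint>
#include <vector>

struct VD { uint64_t Value, Count; };

// Returns the prefix of Entries with nonzero counts, sorted descending.
std::vector<VD> sortAndTrim(std::vector<VD> Entries) {
  std::stable_sort(Entries.begin(), Entries.end(),
                   [](const VD &L, const VD &R) { return L.Count > R.Count; });
  // After the descending sort, zero-count entries form a suffix, so
  // upper_bound finds the first of them.
  auto Tail = std::upper_bound(
      Entries.begin(), Entries.end(), uint64_t(0),
      [](uint64_t Zero, const VD &E) { return E.Count <= Zero; });
  Entries.erase(Tail, Entries.end());
  return Entries;
}
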
@@ -726,7 +788,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
uint64_t TotalFuncCount, uint32_t NumCandidates,
MutableArrayRef<InstrProfValueData> ICallProfDataRef,
VTableGUIDCountsMap &VTableGUIDCounts) {
- SmallVector<uint64_t, 4> PromotedFuncCount;
+ SmallVector<std::pair<uint32_t, uint64_t>, 4> PromotedFuncCount;
for (const auto &Candidate : Candidates) {
for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
@@ -771,7 +833,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
return Remark;
});
- PromotedFuncCount.push_back(Candidate.Count);
+ PromotedFuncCount.push_back({Candidate.Index, Candidate.Count});
assert(TotalFuncCount >= Candidate.Count &&
"Within one prof metadata, total count is the sum of counts from "
@@ -792,22 +854,12 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
 // used to load multiple virtual functions. The vtable profiles need to be
 // updated properly in that case (e.g., for each indirect call annotate both
// type profiles and function profiles in one !prof).
- for (size_t I = 0; I < PromotedFuncCount.size(); I++)
- ICallProfDataRef[I].Count -=
- std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count);
- // Sort value profiles by count in descending order.
- llvm::stable_sort(ICallProfDataRef, [](const InstrProfValueData &LHS,
- const InstrProfValueData &RHS) {
- return LHS.Count > RHS.Count;
- });
- // Drop the <target-value, count> pair if count is zero.
- ArrayRef<InstrProfValueData> VDs(
- ICallProfDataRef.begin(),
- llvm::upper_bound(ICallProfDataRef, 0U,
- [](uint64_t Count, const InstrProfValueData &ProfData) {
- return ProfData.Count <= Count;
- }));
- updateFuncValueProfiles(CB, VDs, TotalFuncCount, NumCandidates);
+ for (size_t I = 0; I < PromotedFuncCount.size(); I++) {
+ uint32_t Index = PromotedFuncCount[I].first;
+ ICallProfDataRef[Index].Count -=
+ std::max(PromotedFuncCount[I].second, ICallProfDataRef[Index].Count);
+ }
+ updateFuncValueProfiles(CB, ICallProfDataRef, TotalFuncCount, NumCandidates);
updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
return true;
}
@@ -822,9 +874,22 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
uint64_t TotalCount;
auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
CB, TotalCount, NumCandidates);
- if (!NumCandidates ||
- (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
+ if (!NumCandidates)
continue;
+ if (PSI && PSI->hasProfileSummary()) {
+ // Don't promote cold candidates.
+ if (PSI->isColdCount(TotalCount)) {
+ LLVM_DEBUG(dbgs() << "Don't promote the cold candidate: TotalCount="
+ << TotalCount << "\n");
+ continue;
+ }
+      // Only promote hot candidates if ICPAllowHotOnly is true.
+ if (ICPAllowHotOnly && !PSI->isHotCount(TotalCount)) {
+ LLVM_DEBUG(dbgs() << "Don't promote the non-hot candidate: TotalCount="
+ << TotalCount << "\n");
+ continue;
+ }
+ }
auto PromotionCandidates = getPromotionCandidatesForCallSite(
*CB, ICallProfDataRef, TotalCount, NumCandidates);
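
A compact model of the new gating policy, assuming hypothetical cold/hot thresholds in place of the ProfileSummaryInfo queries:

#include <cstdint>

// Hypothetical stand-ins for the isColdCount/isHotCount thresholds.
struct Summary { uint64_t ColdMax, HotMin; };

// Cold sites are never promoted; lukewarm sites are promoted only when the
// hot-only flag is off, matching the gating above.
bool shouldConsider(const Summary &S, uint64_t TotalCount, bool HotOnly) {
  if (TotalCount <= S.ColdMax)
    return false;
  if (HotOnly && TotalCount < S.HotMin)
    return false;
  return true;
}
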
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index 55f3239..2486e77 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -72,7 +72,7 @@ static void emitRemark(IntrinsicInst *II, OptimizationRemarkEmitter &ORE,
}
}
-static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
+static bool lowerAllowChecks(Function &F, const BlockFrequencyInfo &BFI,
const ProfileSummaryInfo *PSI,
OptimizationRemarkEmitter &ORE,
const LowerAllowCheckPass::Options &Opts) {
@@ -160,7 +160,7 @@ PreservedAnalyses LowerAllowCheckPass::run(Function &F,
OptimizationRemarkEmitter &ORE =
AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- return removeUbsanTraps(F, BFI, PSI, ORE, Opts)
+ return lowerAllowChecks(F, BFI, PSI, ORE, Opts)
// We do not change the CFG, we only replace the intrinsics with
// true or false.
? PreservedAnalyses::none().preserveSet<CFGAnalyses>()
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index e5b357f..a9a0731 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -361,6 +361,131 @@ static void addVPMetadata(Module &M, Instruction &I,
}
}
+static void
+handleAllocSite(Instruction &I, CallBase *CI,
+ ArrayRef<uint64_t> InlinedCallStack, LLVMContext &Ctx,
+ OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
+ const std::set<const AllocationInfo *> &AllocInfoSet,
+ std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
+ &FullStackIdToAllocMatchInfo) {
+ // We may match this instruction's location list to multiple MIB
+ // contexts. Add them to a Trie specialized for trimming the contexts to
+ // the minimal needed to disambiguate contexts with unique behavior.
+ CallStackTrie AllocTrie(&ORE, MaxColdSize);
+ uint64_t TotalSize = 0;
+ uint64_t TotalColdSize = 0;
+ for (auto *AllocInfo : AllocInfoSet) {
+ // Check the full inlined call stack against this one.
+ // If we found and thus matched all frames on the call, include
+ // this MIB.
+ if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
+ InlinedCallStack)) {
+ NumOfMemProfMatchedAllocContexts++;
+ uint64_t FullStackId = 0;
+ if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
+ FullStackId = computeFullStackId(AllocInfo->CallStack);
+ auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
+ TotalSize += AllocInfo->Info.getTotalSize();
+ if (AllocType == AllocationType::Cold)
+ TotalColdSize += AllocInfo->Info.getTotalSize();
+ // Record information about the allocation if match info printing
+ // was requested.
+ if (ClPrintMemProfMatchInfo) {
+ assert(FullStackId != 0);
+ FullStackIdToAllocMatchInfo[std::make_pair(FullStackId,
+ InlinedCallStack.size())] = {
+ AllocInfo->Info.getTotalSize(), AllocType};
+ }
+ }
+ }
+ // If the threshold for the percent of cold bytes is less than 100%,
+ // and not all bytes are cold, see if we should still hint this
+ // allocation as cold without context sensitivity.
+ if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
+ TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
+ AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, "dominant");
+ return;
+ }
+
+ // We might not have matched any to the full inlined call stack.
+ // But if we did, create and attach metadata, or a function attribute if
+ // all contexts have identical profiled behavior.
+ if (!AllocTrie.empty()) {
+ NumOfMemProfMatchedAllocs++;
+ // MemprofMDAttached will be false if a function attribute was
+ // attached.
+ bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
+ assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
+ if (MemprofMDAttached) {
+ // Add callsite metadata for the instruction's location list so that
+      // it is simpler later on to identify which part of the MIB contexts
+ // are from this particular instruction (including during inlining,
+ // when the callsite metadata will be updated appropriately).
+ // FIXME: can this be changed to strip out the matching stack
+ // context ids from the MIB contexts and not add any callsite
+ // metadata here to save space?
+ addCallsiteMetadata(I, InlinedCallStack, Ctx);
+ }
+ }
+}
+
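
The dominant-cold check in handleAllocSite avoids floating point by cross-multiplying. A self-checking sketch with illustrative numbers:

#include <cassert>
#include <cstdint>

// Matches: TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
//          TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize
bool hintDominantCold(uint64_t Cold, uint64_t Total, uint64_t MinPercent) {
  return Cold < Total && MinPercent < 100 && Cold * 100 >= MinPercent * Total;
}

int main() {
  assert(hintDominantCold(90, 100, 80));   // 90% cold clears an 80% threshold
  assert(!hintDominantCold(70, 100, 80));  // 70% cold does not
  assert(!hintDominantCold(100, 100, 80)); // fully cold is handled by the trie
}
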
+// Helper struct for maintaining refs to callsite data. As an alternative we
+// could store a pointer to the CallSiteInfo struct but we also need the frame
+// index. Using ArrayRefs instead makes it a little easier to read.
+struct CallSiteEntry {
+ // Subset of frames for the corresponding CallSiteInfo.
+ ArrayRef<Frame> Frames;
+ // Potential targets for indirect calls.
+ ArrayRef<GlobalValue::GUID> CalleeGuids;
+
+ // Only compare Frame contents.
+ // Use pointer-based equality instead of ArrayRef's operator== which does
+ // element-wise comparison. We want to check if it's the same slice of the
+ // underlying array, not just equivalent content.
+ bool operator==(const CallSiteEntry &Other) const {
+ return Frames.data() == Other.Frames.data() &&
+ Frames.size() == Other.Frames.size();
+ }
+};
+
+struct CallSiteEntryHash {
+ size_t operator()(const CallSiteEntry &Entry) const {
+ return computeFullStackId(Entry.Frames);
+ }
+};
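
Why the hash/equality pair above compares slice identity rather than contents: two distinct slices of the frame array can hold equal elements. A standalone illustration, with a plain pointer-plus-size pair standing in for ArrayRef:

#include <cassert>
#include <cstddef>
#include <vector>

struct Slice {
  const int *Data;
  std::size_t Size;
  bool sameSlice(const Slice &O) const { // identity, like operator== above
    return Data == O.Data && Size == O.Size;
  }
};

int main() {
  std::vector<int> Frames = {1, 2, 1, 2};
  Slice A{Frames.data(), 2};     // frames [0, 2)
  Slice B{Frames.data() + 2, 2}; // frames [2, 4): equal contents, other slice
  assert(!A.sameSlice(B)); // element-wise comparison would have said equal
  assert(A.sameSlice(A));
}
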
+
+static void handleCallSite(
+ Instruction &I, const Function *CalledFunction,
+ ArrayRef<uint64_t> InlinedCallStack,
+ const std::unordered_set<CallSiteEntry, CallSiteEntryHash> &CallSiteEntries,
+ Module &M, std::set<std::vector<uint64_t>> &MatchedCallSites) {
+ auto &Ctx = M.getContext();
+ for (const auto &CallSiteEntry : CallSiteEntries) {
+ // If we found and thus matched all frames on the call, create and
+ // attach call stack metadata.
+ if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
+ InlinedCallStack)) {
+ NumOfMemProfMatchedCallSites++;
+ addCallsiteMetadata(I, InlinedCallStack, Ctx);
+
+ // Try to attach indirect call metadata if possible.
+ if (!CalledFunction)
+ addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
+
+ // Only need to find one with a matching call stack and add a single
+ // callsite metadata.
+
+ // Accumulate call site matching information upon request.
+ if (ClPrintMemProfMatchInfo) {
+ std::vector<uint64_t> CallStack;
+ append_range(CallStack, InlinedCallStack);
+ MatchedCallSites.insert(std::move(CallStack));
+ }
+ break;
+ }
+ }
+}
+
static void readMemprof(Module &M, Function &F,
IndexedInstrProfReader *MemProfReader,
const TargetLibraryInfo &TLI,
@@ -431,31 +556,6 @@ static void readMemprof(Module &M, Function &F,
// (allocation info and the callsites).
std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
- // Helper struct for maintaining refs to callsite data. As an alternative we
- // could store a pointer to the CallSiteInfo struct but we also need the frame
- // index. Using ArrayRefs instead makes it a little easier to read.
- struct CallSiteEntry {
- // Subset of frames for the corresponding CallSiteInfo.
- ArrayRef<Frame> Frames;
- // Potential targets for indirect calls.
- ArrayRef<GlobalValue::GUID> CalleeGuids;
-
- // Only compare Frame contents.
- // Use pointer-based equality instead of ArrayRef's operator== which does
- // element-wise comparison. We want to check if it's the same slice of the
- // underlying array, not just equivalent content.
- bool operator==(const CallSiteEntry &Other) const {
- return Frames.data() == Other.Frames.data() &&
- Frames.size() == Other.Frames.size();
- }
- };
-
- struct CallSiteEntryHash {
- size_t operator()(const CallSiteEntry &Entry) const {
- return computeFullStackId(Entry.Frames);
- }
- };
-
// For the callsites we need to record slices of the frame array (see comments
// below where the map entries are added) along with their CalleeGuids.
std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>>
@@ -553,100 +653,15 @@ static void readMemprof(Module &M, Function &F,
// allocation context with the same leaf.
if (AllocInfoIter != LocHashToAllocInfo.end() &&
// Only consider allocations which support hinting.
- isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI)) {
- // We may match this instruction's location list to multiple MIB
- // contexts. Add them to a Trie specialized for trimming the contexts to
- // the minimal needed to disambiguate contexts with unique behavior.
- CallStackTrie AllocTrie(&ORE, MaxColdSize);
- uint64_t TotalSize = 0;
- uint64_t TotalColdSize = 0;
- for (auto *AllocInfo : AllocInfoIter->second) {
- // Check the full inlined call stack against this one.
- // If we found and thus matched all frames on the call, include
- // this MIB.
- if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
- InlinedCallStack)) {
- NumOfMemProfMatchedAllocContexts++;
- uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
- FullStackId = computeFullStackId(AllocInfo->CallStack);
- auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
- TotalSize += AllocInfo->Info.getTotalSize();
- if (AllocType == AllocationType::Cold)
- TotalColdSize += AllocInfo->Info.getTotalSize();
- // Record information about the allocation if match info printing
- // was requested.
- if (ClPrintMemProfMatchInfo) {
- assert(FullStackId != 0);
- FullStackIdToAllocMatchInfo[std::make_pair(
- FullStackId, InlinedCallStack.size())] = {
- AllocInfo->Info.getTotalSize(), AllocType};
- }
- }
- }
- // If the threshold for the percent of cold bytes is less than 100%,
- // and not all bytes are cold, see if we should still hint this
- // allocation as cold without context sensitivity.
- if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
- TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
- AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold,
- "dominant");
- continue;
- }
-
- // We might not have matched any to the full inlined call stack.
- // But if we did, create and attach metadata, or a function attribute if
- // all contexts have identical profiled behavior.
- if (!AllocTrie.empty()) {
- NumOfMemProfMatchedAllocs++;
- // MemprofMDAttached will be false if a function attribute was
- // attached.
- bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
- assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
- if (MemprofMDAttached) {
- // Add callsite metadata for the instruction's location list so that
- // it simpler later on to identify which part of the MIB contexts
- // are from this particular instruction (including during inlining,
- // when the callsite metadata will be updated appropriately).
- // FIXME: can this be changed to strip out the matching stack
- // context ids from the MIB contexts and not add any callsite
- // metadata here to save space?
- addCallsiteMetadata(I, InlinedCallStack, Ctx);
- }
- }
- continue;
- }
-
- if (CallSitesIter == LocHashToCallSites.end())
- continue;
-
- // Otherwise, add callsite metadata. If we reach here then we found the
- // instruction's leaf location in the callsites map and not the allocation
- // map.
- for (const auto &CallSiteEntry : CallSitesIter->second) {
- // If we found and thus matched all frames on the call, create and
- // attach call stack metadata.
- if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
- InlinedCallStack)) {
- NumOfMemProfMatchedCallSites++;
- addCallsiteMetadata(I, InlinedCallStack, Ctx);
-
- // Try to attach indirect call metadata if possible.
- if (!CalledFunction)
- addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
-
- // Only need to find one with a matching call stack and add a single
- // callsite metadata.
-
- // Accumulate call site matching information upon request.
- if (ClPrintMemProfMatchInfo) {
- std::vector<uint64_t> CallStack;
- append_range(CallStack, InlinedCallStack);
- MatchedCallSites.insert(std::move(CallStack));
- }
- break;
- }
- }
+ isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI))
+ handleAllocSite(I, CI, InlinedCallStack, Ctx, ORE, MaxColdSize,
+ AllocInfoIter->second, FullStackIdToAllocMatchInfo);
+ else if (CallSitesIter != LocHashToCallSites.end())
+ // Otherwise, add callsite metadata. If we reach here then we found the
+ // instruction's leaf location in the callsites map and not the
+ // allocation map.
+ handleCallSite(I, CalledFunction, InlinedCallStack,
+ CallSitesIter->second, M, MatchedCallSites);
}
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 7b58316..df31f07 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -158,7 +158,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/bit.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -1216,7 +1215,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
};
SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
DenseMap<const DILocation *, int> LazyWarningDebugLocationCount;
- bool InstrumentLifetimeStart = ClHandleLifetimeIntrinsics;
SmallSetVector<AllocaInst *, 16> AllocaSet;
SmallVector<std::pair<IntrinsicInst *, AllocaInst *>, 16> LifetimeStartList;
SmallVector<StoreInst *, 16> StoreList;
@@ -1623,7 +1621,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Poison llvm.lifetime.start intrinsics, if we haven't fallen back to
// instrumenting only allocas.
- if (InstrumentLifetimeStart) {
+ if (ClHandleLifetimeIntrinsics) {
for (auto Item : LifetimeStartList) {
instrumentAlloca(*Item.second, Item.first);
AllocaSet.remove(Item.second);
@@ -3303,9 +3301,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleLifetimeStart(IntrinsicInst &I) {
if (!PoisonStack)
return;
- AllocaInst *AI = llvm::findAllocaForValue(I.getArgOperand(1));
- if (!AI)
- InstrumentLifetimeStart = false;
+ AllocaInst *AI = cast<AllocaInst>(I.getArgOperand(1));
LifetimeStartList.push_back(std::make_pair(&I, AI));
}
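
Several hunks in this patch (here, and in InferAddressSpaces, MemoryTaggingSupport, and TailRecursionElimination below) lean on the same assumed invariant: lifetime intrinsics now receive the alloca directly, so the findAllocaForValue lookup collapses into a plain cast. The call-shape change, sketched as comments:

// Before: tolerate casts and bail out when no unique alloca is found.
//   AllocaInst *AI = llvm::findAllocaForValue(II->getArgOperand(1));
//   if (!AI) { /* fall back or skip */ }
// After: assuming the operand is always the alloca itself, the cast is total.
//   AllocaInst *AI = cast<AllocaInst>(II->getArgOperand(1));
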
diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index 3fa844e..6135c7b 100644
--- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -46,6 +46,8 @@ enum class ARCRuntimeEntryPointKind {
UnsafeClaimRV,
RetainAutorelease,
RetainAutoreleaseRV,
+ AutoreleasePoolPush,
+ AutoreleasePoolPop,
};
/// Declarations for ObjC runtime functions and constants. These are initialized
@@ -67,6 +69,8 @@ public:
UnsafeClaimRV = nullptr;
RetainAutorelease = nullptr;
RetainAutoreleaseRV = nullptr;
+ AutoreleasePoolPush = nullptr;
+ AutoreleasePoolPop = nullptr;
}
Function *get(ARCRuntimeEntryPointKind kind) {
@@ -101,6 +105,12 @@ public:
case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
return getIntrinsicEntryPoint(RetainAutoreleaseRV,
Intrinsic::objc_retainAutoreleaseReturnValue);
+ case ARCRuntimeEntryPointKind::AutoreleasePoolPush:
+ return getIntrinsicEntryPoint(AutoreleasePoolPush,
+ Intrinsic::objc_autoreleasePoolPush);
+ case ARCRuntimeEntryPointKind::AutoreleasePoolPop:
+ return getIntrinsicEntryPoint(AutoreleasePoolPop,
+ Intrinsic::objc_autoreleasePoolPop);
}
llvm_unreachable("Switch should be a covered switch.");
@@ -143,6 +153,12 @@ private:
/// Declaration for objc_retainAutoreleaseReturnValue().
Function *RetainAutoreleaseRV = nullptr;
+ /// Declaration for objc_autoreleasePoolPush().
+ Function *AutoreleasePoolPush = nullptr;
+
+ /// Declaration for objc_autoreleasePoolPop().
+ Function *AutoreleasePoolPop = nullptr;
+
Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) {
if (Decl)
return Decl;
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 5eb3f51..66a2c76 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -39,6 +39,7 @@
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -132,11 +133,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
//
// The second retain and autorelease can be deleted.
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
+// TODO: Autorelease calls followed by objc_autoreleasePoolPop calls (perhaps in
+// ObjC++ code after inlining) can be turned into plain release calls.
 // TODO: Critical-edge splitting. If the optimal insertion point is
 // a critical edge, the current algorithm has to fail, because it doesn't
@@ -566,6 +564,8 @@ class ObjCARCOpt {
void OptimizeReturns(Function &F);
+ void OptimizeAutoreleasePools(Function &F);
+
template <typename PredicateT>
static void cloneOpBundlesIf(CallBase *CI,
SmallVectorImpl<OperandBundleDef> &OpBundles,
@@ -2473,6 +2473,11 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) {
(1 << unsigned(ARCInstKind::AutoreleaseRV))))
OptimizeReturns(F);
+ // Optimizations for autorelease pools.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::AutoreleasepoolPush)) |
+ (1 << unsigned(ARCInstKind::AutoreleasepoolPop))))
+ OptimizeAutoreleasePools(F);
+
// Gather statistics after optimization.
#ifndef NDEBUG
if (AreStatisticsEnabled()) {
@@ -2485,6 +2490,183 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) {
return Changed;
}
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
+bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) {
+ if (CB.onlyReadsMemory())
+ return false;
+
+  // This recursion depth limit is arbitrary. It's just large enough to
+  // cover the known interesting test cases.
+ if (Depth > 5)
+ return true;
+
+ if (const Function *Callee = CB.getCalledFunction()) {
+ if (!Callee->hasExactDefinition())
+ return true;
+ for (const BasicBlock &BB : *Callee) {
+ for (const Instruction &I : BB) {
+ // TODO: Ignore all instructions between autorelease pools
+ ARCInstKind InstKind = GetBasicARCInstKind(&I);
+ switch (InstKind) {
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ case ARCInstKind::LoadWeak:
+ // These may produce autoreleases
+ return true;
+
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::UnsafeClaimRV:
+ case ARCInstKind::RetainBlock:
+ case ARCInstKind::Release:
+ case ARCInstKind::NoopCast:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::DestroyWeak:
+ case ARCInstKind::StoreStrong:
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::AutoreleasepoolPop:
+ // These ObjC runtime functions don't produce autoreleases
+ break;
+
+ case ARCInstKind::CallOrUser:
+ case ARCInstKind::Call:
+ // For non-ObjC function calls, recursively analyze
+ if (MayAutorelease(cast<CallBase>(I), Depth + 1))
+ return true;
+ break;
+
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ case ARCInstKind::None:
+ // These are not relevant for autorelease analysis
+ break;
+ }
+ }
+ }
+ return false;
+ }
+
+ return true;
+}
+
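
MayAutorelease is a conservative, depth-limited reachability query over callees. Abstracted into a standalone sketch (Fn and its fields are hypothetical stand-ins; the cut-off mirrors the pass's arbitrary limit of 5):

#include <vector>

struct Fn {
  bool Opaque;         // no exact definition available: assume the worst
  bool HasAutorelease; // autorelease-producing instruction in the body
  std::vector<const Fn *> Calls;
};

// True unless we can prove no autorelease is reachable within Depth calls.
bool mayAutorelease(const Fn &F, unsigned Depth = 0) {
  if (Depth > 5) // same arbitrary cut-off as the pass
    return true;
  if (F.Opaque || F.HasAutorelease)
    return true;
  for (const Fn *Callee : F.Calls)
    if (mayAutorelease(*Callee, Depth + 1))
      return true;
  return false;
}
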
+/// Optimize autorelease pools by eliminating empty push/pop pairs.
+void ObjCARCOpt::OptimizeAutoreleasePools(Function &F) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeAutoreleasePools ==\n");
+
+ OptimizationRemarkEmitter ORE(&F);
+
+ // Process each basic block independently.
+ // TODO: Can we optimize inter-block autorelease pool pairs?
+ // This would involve tracking autorelease pool state across blocks.
+ for (BasicBlock &BB : F) {
+ // Use a stack to track nested autorelease pools
+ SmallVector<std::pair<CallInst *, bool>, 4>
+ PoolStack; // {push_inst, has_autorelease_in_scope}
+
+ for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
+ ARCInstKind Class = GetBasicARCInstKind(&Inst);
+
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPush: {
+ // Start tracking a new autorelease pool scope
+ auto *Push = cast<CallInst>(&Inst);
+ PoolStack.push_back(
+ {Push, false}); // {push_inst, has_autorelease_in_scope}
+ LLVM_DEBUG(dbgs() << "Found autorelease pool push: " << *Push << "\n");
+ break;
+ }
+
+ case ARCInstKind::AutoreleasepoolPop: {
+ auto *Pop = cast<CallInst>(&Inst);
+
+ if (PoolStack.empty())
+ break;
+
+ auto &TopPool = PoolStack.back();
+ CallInst *PendingPush = TopPool.first;
+ bool HasAutoreleaseInScope = TopPool.second;
+
+ // Pop the stack - remove this pool scope
+ PoolStack.pop_back();
+
+ // Bail if this pop doesn't match the pending push
+ if (Pop->getArgOperand(0)->stripPointerCasts() != PendingPush)
+ break;
+
+ // Bail if there were autoreleases in this scope
+ if (HasAutoreleaseInScope)
+ break;
+
+ // Optimize: eliminate this empty autorelease pool pair
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "AutoreleasePoolElimination",
+ PendingPush)
+ << "eliminated empty autorelease pool pair";
+ });
+
+ // Replace all uses of push with poison before deletion
+ PendingPush->replaceAllUsesWith(
+ PoisonValue::get(PendingPush->getType()));
+
+ Pop->eraseFromParent();
+ PendingPush->eraseFromParent();
+
+ Changed = true;
+ ++NumNoops;
+ break;
+ }
+ case ARCInstKind::CallOrUser:
+ case ARCInstKind::Call:
+ if (!MayAutorelease(cast<CallBase>(Inst)))
+ break;
+ LLVM_FALLTHROUGH;
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ case ARCInstKind::LoadWeak: {
+ // Track that we have autorelease calls in the current pool scope
+ if (!PoolStack.empty()) {
+ PoolStack.back().second = true; // Set has_autorelease_in_scope = true
+ LLVM_DEBUG(
+ dbgs()
+ << "Found autorelease or potential autorelease in pool scope: "
+ << Inst << "\n");
+ }
+ break;
+ }
+
+ // Enumerate all remaining ARCInstKind cases explicitly
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::UnsafeClaimRV:
+ case ARCInstKind::RetainBlock:
+ case ARCInstKind::Release:
+ case ARCInstKind::NoopCast:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::DestroyWeak:
+ case ARCInstKind::StoreStrong:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ case ARCInstKind::None:
+ // These instruction kinds don't affect autorelease pool optimization
+ break;
+ }
+ }
+ }
+}
+
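
The per-block scan above is bracket matching with a side flag. A standalone model of the same bookkeeping (Ev and the event list are hypothetical stand-ins for the ARC instruction kinds, and the operand-matching bail-out is elided):

#include <cstddef>
#include <utility>
#include <vector>

enum class Ev { Push, Pop, Autorelease };

// Counts push/pop pairs that enclose no autorelease -- the pairs the pass
// deletes. Unmatched pops are ignored, as in the real scan.
std::size_t countEmptyPairs(const std::vector<Ev> &Events) {
  std::vector<std::pair<std::size_t, bool>> Stack; // {push idx, saw autorelease}
  std::size_t Empty = 0;
  for (std::size_t I = 0; I < Events.size(); ++I) {
    switch (Events[I]) {
    case Ev::Push:
      Stack.push_back({I, false});
      break;
    case Ev::Pop:
      if (!Stack.empty()) {
        if (!Stack.back().second)
          ++Empty; // nothing was autoreleased in this scope
        Stack.pop_back();
      }
      break;
    case Ev::Autorelease:
      if (!Stack.empty())
        Stack.back().second = true; // only the innermost scope is tainted
      break;
    }
  }
  return Empty;
}
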
/// @}
///
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index df31602..1ddb8ae 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1486,10 +1486,8 @@ static bool checkAndReplaceCondition(
// Update the debug value records that satisfy the same condition used
// in replaceUsesWithIf.
- SmallVector<DbgVariableIntrinsic *> DbgUsers;
SmallVector<DbgVariableRecord *> DVRUsers;
- findDbgUsers(DbgUsers, Cmp, &DVRUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(Cmp, DVRUsers);
for (auto *DVR : DVRUsers) {
auto *DTN = DT.getNode(DVR->getParent());
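
The same mechanical migration repeats in most of the hunks below. The call-shape change, sketched as comments (the signatures shown are exactly those the diffs use):

// Before: intrinsic users and record users were collected in parallel, with
// the intrinsic list asserted empty.
//   SmallVector<DbgVariableIntrinsic *> DbgUsers;
//   SmallVector<DbgVariableRecord *> DVRUsers;
//   findDbgUsers(DbgUsers, V, &DVRUsers);
//   assert(DbgUsers.empty());
// After: only debug records are collected.
//   SmallVector<DbgVariableRecord *> DVRUsers;
//   findDbgUsers(V, DVRUsers);
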
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 66836ef..85ee824 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -430,6 +430,8 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
+ // Always force lifetime markers to work directly on the alloca.
+ NewV = NewV->stripPointerCasts();
Function *NewDecl = Intrinsic::getOrInsertDeclaration(
M, II->getIntrinsicID(), {NewV->getType()});
II->setArgOperand(1, NewV);
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 4d1f4407..c2a737d 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1960,7 +1960,6 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
// PHI insertion, of which we are prepared to do, clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use *, 16> UsesToRename;
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
for (Instruction &I : *BB) {
@@ -1978,8 +1977,7 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
}
// Find debug values outside of the block
- findDbgValues(DbgValues, &I, &DbgVariableRecords);
- assert(DbgValues.empty());
+ findDbgValues(&I, DbgVariableRecords);
llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) {
return DbgVarRec->getParent() == BB;
});
@@ -2000,7 +1998,6 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
if (!DbgVariableRecords.empty()) {
SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords);
- DbgValues.clear();
DbgVariableRecords.clear();
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8c84b0d..03b92d3 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -88,7 +88,6 @@
#include <cassert>
#include <cstdint>
#include <utility>
-#include <vector>
using namespace llvm;
using namespace SCEVPatternMatch;
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 221094f..b9546c5 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -128,6 +128,8 @@ private:
// from any other block. So this variable set to true means that loop's latch
// has become unreachable from loop header.
bool DeleteCurrentLoop = false;
+ // Whether or not we enter the loop through an indirectbr.
+ bool HasIndirectEntry = false;
// The blocks of the original loop that will still be reachable from entry
// after the constant folding.
@@ -216,6 +218,19 @@ private:
return;
}
+ // We need a loop preheader to split in handleDeadExits(). If LoopSimplify
+ // wasn't able to form one because the loop can be entered through an
+ // indirectbr we cannot continue.
+ if (!L.getLoopPreheader()) {
+ assert(any_of(predecessors(L.getHeader()),
+ [&](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator());
+ }) &&
+ "Loop should have preheader if it is not entered indirectly");
+ HasIndirectEntry = true;
+ return;
+ }
+
// Collect live and dead loop blocks and exits.
LiveLoopBlocks.insert(L.getHeader());
for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
@@ -546,6 +561,12 @@ public:
return false;
}
+ if (HasIndirectEntry) {
+ LLVM_DEBUG(dbgs() << "Loops which can be entered indirectly are not"
+ " supported!\n");
+ return false;
+ }
+
// Nothing to constant-fold.
if (FoldCandidates.empty()) {
LLVM_DEBUG(
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 9e318b0..e3ef9d8 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3785,7 +3785,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Ignore icmp instructions which are already being analyzed.
if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
unsigned OtherIdx = !U.getOperandNo();
- Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ Value *OtherOp = ICI->getOperand(OtherIdx);
if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
continue;
}
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 7eeaaa0..6a3f656 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -82,6 +82,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
@@ -3044,6 +3045,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
if (isInstructionTriviallyDead(&I, TLI)) {
InstrDFS[&I] = 0;
LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ salvageDebugInfo(I);
markInstructionForDeletion(&I);
continue;
}
@@ -4076,6 +4078,12 @@ bool NewGVN::eliminateInstructions(Function &F) {
if (!match(DefI, m_Intrinsic<Intrinsic::ssa_copy>()))
patchReplacementInstruction(DefI, DominatingLeader);
+ SmallVector<DbgVariableRecord *> DVRUsers;
+ findDbgUsers(DefI, DVRUsers);
+
+ for (auto *DVR : DVRUsers)
+ DVR->replaceVariableLocationOp(DefI, DominatingLeader);
+
markInstructionForDeletion(DefI);
}
}
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 820c8e1..aae5d60 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -458,8 +458,10 @@ bool ScalarizerVisitor::visit(Function &F) {
Instruction *I = &*II;
bool Done = InstVisitor::visit(I);
++II;
- if (Done && I->getType()->isVoidTy())
+ if (Done && I->getType()->isVoidTy()) {
I->eraseFromParent();
+ Scalarized = true;
+ }
}
}
return finish();
@@ -1105,7 +1107,9 @@ bool ScalarizerVisitor::visitExtractValueInst(ExtractValueInst &EVI) {
Res.push_back(ResElem);
}
- gather(&EVI, Res, *VS);
+ Type *ActualVecType = cast<FixedVectorType>(OpTy->getContainedType(Index));
+ std::optional<VectorSplit> AVS = getVectorSplit(ActualVecType);
+ gather(&EVI, Res, *AVS);
return true;
}
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc0..f6959ca2 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2144,9 +2144,23 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName,
bool CurrentLoopValid, bool PartiallyInvariant,
bool InjectedCondition, ArrayRef<Loop *> NewLoops) {
- // If we did a non-trivial unswitch, we have added new (cloned) loops.
- if (!NewLoops.empty())
+ auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag,
+ StringRef DisableTag) {
+ auto &Ctx = TargetLoop->getHeader()->getContext();
+ MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag));
+ MDNode *NewLoopID = makePostTransformationMetadata(
+ Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD});
+ TargetLoop->setLoopID(NewLoopID);
+ };
+
+ // If we performed a non-trivial unswitch, we have added new cloned loops.
+ // Mark such newly-created loops as visited.
+ if (!NewLoops.empty()) {
+ for (Loop *NL : NewLoops)
+ RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial",
+ "llvm.loop.unswitch.nontrivial.disable");
U.addSiblingLoops(NewLoops);
+ }
// If the current loop remains valid, we should revisit it to catch any
// other unswitch opportunities. Otherwise, we need to mark it as deleted.
@@ -2154,24 +2168,12 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName,
if (PartiallyInvariant) {
// Mark the new loop as partially unswitched, to avoid unswitching on
// the same condition again.
- auto &Context = L.getHeader()->getContext();
- MDNode *DisableUnswitchMD = MDNode::get(
- Context,
- MDString::get(Context, "llvm.loop.unswitch.partial.disable"));
- MDNode *NewLoopID = makePostTransformationMetadata(
- Context, L.getLoopID(), {"llvm.loop.unswitch.partial"},
- {DisableUnswitchMD});
- L.setLoopID(NewLoopID);
+ RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial",
+ "llvm.loop.unswitch.partial.disable");
} else if (InjectedCondition) {
// Do the same for injection of invariant conditions.
- auto &Context = L.getHeader()->getContext();
- MDNode *DisableUnswitchMD = MDNode::get(
- Context,
- MDString::get(Context, "llvm.loop.unswitch.injection.disable"));
- MDNode *NewLoopID = makePostTransformationMetadata(
- Context, L.getLoopID(), {"llvm.loop.unswitch.injection"},
- {DisableUnswitchMD});
- L.setLoopID(NewLoopID);
+ RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection",
+ "llvm.loop.unswitch.injection.disable");
} else
U.revisitCurrentLoop();
} else
@@ -2809,9 +2811,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion to 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -3507,8 +3509,9 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates;
IVConditionInfo PartialIVInfo;
Instruction *PartialIVCondBranch = nullptr;
- collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
- PartialIVCondBranch, L, LI, AA, MSSAU);
+ if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable"))
+ collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
+ PartialIVCondBranch, L, LI, AA, MSSAU);
if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable"))
collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo,
PartialIVCondBranch, L, DT, LI, AA,
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 7828571..1d83ddc 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -343,8 +343,7 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
///
static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
- llvm::findAllocaForValue(II->getArgOperand(1)))
+ if (II->getIntrinsicID() == Intrinsic::lifetime_end)
return true;
// FIXME: We can move load/store/call/free instructions above the call if the
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index f7e66ec..a4fa0e2 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -68,6 +68,7 @@ add_llvm_component_library(LLVMTransformUtils
MoveAutoInit.cpp
NameAnonGlobals.cpp
PredicateInfo.cpp
+ ProfileVerify.cpp
PromoteMemoryToRegister.cpp
RelLookupTableConverter.cpp
ScalarEvolutionExpander.cpp
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 1d1af42..7a9dd37 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1219,10 +1219,8 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
/// \p F.
static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
for (Instruction &I : instructions(F)) {
- SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- findDbgUsers(DbgUsers, &I, &DbgVariableRecords);
- assert(DbgUsers.empty());
+ findDbgUsers(&I, DbgVariableRecords);
for (DbgVariableRecord *DVR : DbgVariableRecords)
if (DVR->getFunction() != &F)
DVR->eraseFromParent();
@@ -1284,10 +1282,8 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
NewFunc.getEntryBlock().getTerminator()->getIterator());
};
for (auto [Input, NewVal] : zip_equal(Inputs, NewValues)) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, Input, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(Input, DPUsers);
DIExpression *Expr = DIB.createExpression();
 // Iterate the debug users of the Input values. If they are in the extracted
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 4210ce6..291e2a5 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -22,7 +22,6 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassInstrumentation.h"
#include "llvm/Pass.h"
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index c3c3cdf..a9e08ad 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -41,7 +41,6 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -243,26 +242,10 @@ formLCSSAForInstructionsImpl(SmallVectorImpl<Instruction *> &Worklist,
SSAUpdate.RewriteUse(*UseToRewrite);
}
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
+ llvm::findDbgValues(I, DbgVariableRecords);
// Update pre-existing debug value uses that reside outside the loop.
- for (auto *DVI : DbgValues) {
- BasicBlock *UserBB = DVI->getParent();
- if (InstBB == UserBB || L->contains(UserBB))
- continue;
- // We currently only handle debug values residing in blocks that were
- // traversed while rewriting the uses. If we inserted just a single PHI,
- // we will handle all relevant debug values.
- Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
- : SSAUpdate.FindValueForBlock(UserBB);
- if (V)
- DVI->replaceVariableLocationOp(I, V);
- }
-
- // RemoveDIs: copy-paste of block above, using non-instruction debug-info
- // records.
for (DbgVariableRecord *DVR : DbgVariableRecords) {
BasicBlock *UserBB = DVR->getMarker()->getParent();
if (InstBB == UserBB || L->contains(UserBB))
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 7f0c23b..babd7f6 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -482,16 +482,11 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I,
if (II->isLifetimeStartOrEnd()) {
auto *Arg = II->getArgOperand(1);
- // Lifetime intrinsics are dead when their right-hand is undef.
- if (isa<UndefValue>(Arg))
- return true;
- // If the right-hand is an alloc, global, or argument and the only uses
- // are lifetime intrinsics then the intrinsics are dead.
- if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
- return llvm::all_of(Arg->uses(), [](Use &Use) {
- return isa<LifetimeIntrinsic>(Use.getUser());
- });
- return false;
+ // If the only uses of the alloca are lifetime intrinsics, then the
+ // intrinsics are dead.
+ return llvm::all_of(Arg->uses(), [](Use &Use) {
+ return isa<LifetimeIntrinsic>(Use.getUser());
+ });
}
// Assumptions are dead if their condition is trivially true.
@@ -610,10 +605,8 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions(
}
bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, I, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(I, DPUsers);
for (auto *DVR : DPUsers)
DVR->setKillLocation();
return !DPUsers.empty();
@@ -1603,10 +1596,8 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
// Since we can't guarantee that the original dbg.declare intrinsic
// is removed by LowerDbgDeclare(), we need to make sure that we are
// not inserting the same dbg.value intrinsic over and over.
- SmallVector<DbgValueInst *, 1> DbgValues;
SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
- findDbgValues(DbgValues, APN, &DbgVariableRecords);
- assert(DbgValues.empty());
+ findDbgValues(APN, DbgVariableRecords);
for (DbgVariableRecord *DVR : DbgVariableRecords) {
assert(is_contained(DVR->location_ops(), APN));
if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr))
@@ -1987,10 +1978,8 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc,
void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
DIBuilder &Builder, int Offset) {
- SmallVector<DbgValueInst *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgValues(DbgUsers, AI, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgValues(AI, DPUsers);
// Replace any DbgVariableRecords that use this alloca.
for (DbgVariableRecord *DVR : DPUsers)
@@ -2002,11 +1991,9 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
/// Where possible to salvage debug information for \p I do so.
/// If not possible mark undef.
void llvm::salvageDebugInfo(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, &I, &DPUsers);
- assert(DbgUsers.empty());
- salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers);
+ findDbgUsers(&I, DPUsers);
+ salvageDebugInfoForDbgValues(I, DPUsers);
}
template <typename T> static void salvageDbgAssignAddress(T *Assign) {
@@ -2044,9 +2031,8 @@ template <typename T> static void salvageDbgAssignAddress(T *Assign) {
}
}
-void llvm::salvageDebugInfoForDbgValues(
- Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers,
- ArrayRef<DbgVariableRecord *> DPUsers) {
+void llvm::salvageDebugInfoForDbgValues(Instruction &I,
+ ArrayRef<DbgVariableRecord *> DPUsers) {
 // These are arbitrarily chosen limits on the maximum number of values and the
// maximum size of a debug expression we can salvage up to, used for
// performance reasons.
@@ -2054,9 +2040,6 @@ void llvm::salvageDebugInfoForDbgValues(
const unsigned MaxExpressionSize = 128;
bool Salvaged = false;
- // We should never see debug intrinsics nowadays.
- assert(DbgUsers.empty());
-
for (auto *DVR : DPUsers) {
if (DVR->isDbgAssign()) {
if (DVR->getAddress() == &I) {
@@ -2343,16 +2326,11 @@ static bool rewriteDebugUsers(
Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
function_ref<DbgValReplacement(DbgVariableRecord &DVR)> RewriteDVRExpr) {
// Find debug users of From.
- SmallVector<DbgVariableIntrinsic *, 1> Users;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(Users, &From, &DPUsers);
- if (Users.empty() && DPUsers.empty())
+ findDbgUsers(&From, DPUsers);
+ if (DPUsers.empty())
return false;
- // Ignore intrinsic-users: they are no longer supported and should never
- // appear.
- assert(Users.empty());
-
// Prevent use-before-def of To.
bool Changed = false;
@@ -3356,10 +3334,8 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
}
void llvm::dropDebugUsers(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
SmallVector<DbgVariableRecord *, 1> DPUsers;
- findDbgUsers(DbgUsers, &I, &DPUsers);
- assert(DbgUsers.empty());
+ findDbgUsers(&I, DPUsers);
for (auto *DVR : DPUsers)
DVR->eraseFromParent();
}
@@ -3876,6 +3852,10 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
if (Op->isSwiftError())
return false;
+ // Cannot replace alloca argument with phi/select.
+ if (I->isLifetimeStartOrEnd())
+ return false;
+
// Early exit.
if (!isa<Constant, InlineAsm>(Op))
return true;
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 06115e0..7cc9ff8 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -158,10 +158,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
// intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords);
- assert(DbgValues.empty());
+ llvm::findDbgValues(OrigHeaderVal, DbgVariableRecords);
for (DbgVariableRecord *DVR : DbgVariableRecords) {
// The original users in the OrigHeader are already using the original
diff --git a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
index 8f55d7b..2743931 100644
--- a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
@@ -319,9 +319,9 @@ void MemoryOpRemark::visitVariable(const Value *V,
// If we find some information in the debug info, take that.
bool FoundDI = false;
- // Try to get an llvm.dbg.declare, which has a DILocalVariable giving us the
+ // Try to get a dbg.declare, which has a DILocalVariable giving us the
// real debug info name and size of the variable.
- auto FindDI = [&](const auto *DVI) {
+ auto FindDI = [&](const DbgVariableRecord *DVI) {
if (DILocalVariable *DILV = DVI->getVariable()) {
std::optional<uint64_t> DISize = getSizeInBytes(DILV->getSizeInBits());
VariableInfo Var{DILV->getName(), DISize};
@@ -331,7 +331,6 @@ void MemoryOpRemark::visitVariable(const Value *V,
}
}
};
- for_each(findDbgDeclares(const_cast<Value *>(V)), FindDI);
for_each(findDVRDeclares(const_cast<Value *>(V)), FindDI);
if (FoundDI) {
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 40dc02c..bea76d3 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -155,11 +155,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE,
return;
}
if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) {
- AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
- if (!AI) {
- Info.UnrecognizedLifetimes.push_back(&Inst);
- return;
- }
+ AllocaInst *AI = cast<AllocaInst>(II->getArgOperand(1));
if (getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting)
return;
if (II->getIntrinsicID() == Intrinsic::lifetime_start)
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index ac413c9..de9deab 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -12,7 +12,6 @@
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
new file mode 100644
index 0000000..b972132
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -0,0 +1,129 @@
+//===- ProfileVerify.cpp - Verify profile info for testing ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ProfileVerify.h"
+#include "llvm/ADT/DynamicAPInt.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/Support/BranchProbability.h"
+
+using namespace llvm;
+namespace {
+class ProfileInjector {
+ Function &F;
+ FunctionAnalysisManager &FAM;
+
+public:
+ static const Instruction *
+ getTerminatorBenefitingFromMDProf(const BasicBlock &BB) {
+ if (succ_size(&BB) < 2)
+ return nullptr;
+ auto *Term = BB.getTerminator();
+ return (isa<BranchInst>(Term) || isa<SwitchInst>(Term) ||
+ isa<IndirectBrInst>(Term) || isa<CallBrInst>(Term))
+ ? Term
+ : nullptr;
+ }
+
+ static Instruction *getTerminatorBenefitingFromMDProf(BasicBlock &BB) {
+ return const_cast<Instruction *>(
+ getTerminatorBenefitingFromMDProf(const_cast<const BasicBlock &>(BB)));
+ }
+
+ ProfileInjector(Function &F, FunctionAnalysisManager &FAM) : F(F), FAM(FAM) {}
+ bool inject();
+};
+} // namespace
+
+// FIXME: currently this injects only for terminators. Select isn't yet
+// supported.
+bool ProfileInjector::inject() {
+  // Get whatever branch probability info can be derived from the given IR,
+  // whether or not it has metadata. The main intention of this pass is to
+  // ensure that other passes don't drop or "forget" to update MD_prof. We do
+  // this as a mode in which lit tests would run. We want to avoid changing the
+  // behavior of those tests. A pass may use BPI (or BFI, which is computed
+  // from BPI). If no metadata is present, BPI is guesstimated by
+  // BranchProbabilityAnalysis. The injector (this pass) only persists whatever
+  // information the analysis provides; in other words, the pass being tested
+  // gets the same BPI it would if the injector weren't running.
+ auto &BPI = FAM.getResult<BranchProbabilityAnalysis>(F);
+
+ bool Changed = false;
+ for (auto &BB : F) {
+ auto *Term = getTerminatorBenefitingFromMDProf(BB);
+ if (!Term || Term->getMetadata(LLVMContext::MD_prof))
+ continue;
+ SmallVector<BranchProbability> Probs;
+ Probs.reserve(Term->getNumSuccessors());
+ for (auto I = 0U, E = Term->getNumSuccessors(); I < E; ++I)
+ Probs.emplace_back(BPI.getEdgeProbability(&BB, Term->getSuccessor(I)));
+
+ assert(llvm::find_if(Probs,
+ [](const BranchProbability &P) {
+ return P.isUnknown();
+ }) == Probs.end() &&
+ "All branch probabilities should be valid");
+ const auto *FirstZeroDenominator =
+ find_if(Probs, [](const BranchProbability &P) {
+ return P.getDenominator() == 0;
+ });
+ (void)FirstZeroDenominator;
+ assert(FirstZeroDenominator == Probs.end());
+ const auto *FirstNonZeroNumerator =
+ find_if(Probs, [](const BranchProbability &P) { return !P.isZero(); });
+ assert(FirstNonZeroNumerator != Probs.end());
+ DynamicAPInt LCM(Probs[0].getDenominator());
+ DynamicAPInt GCD(FirstNonZeroNumerator->getNumerator());
+ for (const auto &Prob : drop_begin(Probs)) {
+ if (!Prob.getNumerator())
+ continue;
+ LCM = llvm::lcm(LCM, DynamicAPInt(Prob.getDenominator()));
+ GCD = llvm::gcd(GCD, DynamicAPInt(Prob.getNumerator()));
+ }
+ SmallVector<uint32_t> Weights;
+ Weights.reserve(Term->getNumSuccessors());
+ for (const auto &Prob : Probs) {
+ DynamicAPInt W =
+ (Prob.getNumerator() * LCM / GCD) / Prob.getDenominator();
+ Weights.emplace_back(static_cast<uint32_t>((int64_t)W));
+ }
+ setBranchWeights(*Term, Weights, /*IsExpected=*/false);
+ Changed = true;
+ }
+ return Changed;
+}
+
+PreservedAnalyses ProfileInjectorPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ ProfileInjector PI(F, FAM);
+ if (!PI.inject())
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+PreservedAnalyses ProfileVerifierPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ for (const auto &BB : F)
+ if (const auto *Term =
+ ProfileInjector::getTerminatorBenefitingFromMDProf(BB))
+ if (!Term->getMetadata(LLVMContext::MD_prof))
+ F.getContext().emitError("Profile verification failed");
+
+ return PreservedAnalyses::none();
+}
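
The LCM/GCD normalisation in inject() maps edge probabilities to small integer branch weights. A worked sketch in plain 64-bit arithmetic, where std::lcm and std::gcd stand in for the DynamicAPInt math the pass uses to avoid overflow:

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

struct Prob { uint64_t Num, Den; };

// weight[i] = (Num[i] * LCM(denominators) / GCD(nonzero numerators)) / Den[i]
std::vector<uint64_t> toWeights(const std::vector<Prob> &Probs) {
  uint64_t L = 1, G = 0;
  for (const Prob &P : Probs) {
    L = std::lcm(L, P.Den);
    if (P.Num)
      G = std::gcd(G, P.Num); // std::gcd(0, x) == x seeds the fold
  }
  assert(G && "at least one probability must be nonzero, as the pass asserts");
  std::vector<uint64_t> W;
  for (const Prob &P : Probs)
    W.push_back(P.Num * (L / P.Den) / G);
  return W;
}

// toWeights({{1,3},{2,3}}) == {1,2}; toWeights({{2,8},{6,8}}) == {1,3}.
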
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 73b5f48..d96f1d6 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -243,10 +243,8 @@ struct AllocaInfo {
OnlyUsedInOneBlock = false;
}
}
- SmallVector<DbgVariableIntrinsic *> AllDbgUsers;
SmallVector<DbgVariableRecord *> AllDPUsers;
- findDbgUsers(AllDbgUsers, AI, &AllDPUsers);
- assert(AllDbgUsers.empty());
+ findDbgUsers(AI, AllDPUsers);
std::copy_if(AllDPUsers.begin(), AllDPUsers.end(),
std::back_inserter(DPUsers),
[](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); });
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 586874f..b9292af 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -19,7 +19,9 @@
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/NoFolder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
@@ -245,11 +247,43 @@ static Value *simplifyInstruction(SCCPSolver &Solver,
const APInt *RHSC;
// Remove masking operations.
if (match(&Inst, m_And(m_Value(X), m_LowBitMask(RHSC)))) {
- ConstantRange LRange = GetRange(Inst.getOperand(0));
+ ConstantRange LRange = GetRange(X);
if (LRange.getUnsignedMax().ule(*RHSC))
return X;
}
+ // Check if we can simplify [us]cmp(X, Y) to X - Y.
+ if (auto *Cmp = dyn_cast<CmpIntrinsic>(&Inst)) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+ // Bail out on 1-bit comparisons.
+ if (BitWidth == 1)
+ return nullptr;
+ ConstantRange LRange = GetRange(LHS);
+ if (LRange.isSizeLargerThan(3))
+ return nullptr;
+ ConstantRange RRange = GetRange(RHS);
+ if (RRange.isSizeLargerThan(3))
+ return nullptr;
+ ConstantRange RHSLower = RRange.sub(APInt(BitWidth, 1));
+ ConstantRange RHSUpper = RRange.add(APInt(BitWidth, 1));
+ ICmpInst::Predicate Pred =
+ Cmp->isSigned() ? CmpInst::ICMP_SLE : CmpInst::ICMP_ULE;
+ if (!RHSLower.icmp(Pred, LRange) || !LRange.icmp(Pred, RHSUpper))
+ return nullptr;
+
+ IRBuilder<NoFolder> Builder(&Inst);
+ Value *Sub = Builder.CreateSub(LHS, RHS, Inst.getName(), /*HasNUW=*/false,
+ /*HasNSW=*/Cmp->isSigned());
+ InsertedValues.insert(Sub);
+ if (Sub->getType() != Inst.getType()) {
+ Sub = Builder.CreateSExtOrTrunc(Sub, Inst.getType());
+ InsertedValues.insert(Sub);
+ }
+ return Sub;
+ }
+
return nullptr;
}
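
To see why the new [us]cmp fold above is sound: scmp/ucmp return -1, 0, or 1, and that equals X - Y exactly when the operand ranges guarantee |X - Y| <= 1, which is what the RHSLower/RHSUpper checks establish. A hypothetical scalar model (not the LLVM API):

    #include <cassert>
    #include <cstdint>

    // Scalar model of llvm.scmp: -1 if X < Y, 0 if X == Y, +1 if X > Y.
    static int64_t scmp(int64_t X, int64_t Y) { return (X > Y) - (X < Y); }

    int main() {
      // With X and Y both constrained to [5, 6], Y - 1 <= X and X <= Y + 1
      // hold for every pair, so scmp(X, Y) == X - Y throughout.
      for (int64_t X = 5; X <= 6; ++X)
        for (int64_t Y = 5; Y <= 6; ++Y)
          assert(scmp(X, Y) == X - Y);
      return 0;
    }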
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 561c898..49d0d95 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -197,10 +197,8 @@ void SSAUpdater::RewriteUse(Use &U) {
}
void SSAUpdater::UpdateDebugValues(Instruction *I) {
- SmallVector<DbgValueInst *, 4> DbgValues;
SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
- llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
- assert(DbgValues.empty());
+ llvm::findDbgValues(I, DbgVariableRecords);
for (auto &DVR : DbgVariableRecords) {
if (DVR->getParent() == I->getParent())
continue;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 75c9650..94b0ab8 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2227,16 +2227,6 @@ static bool canSinkInstructions(
return I->getOperand(OI) == I0->getOperand(OI);
};
if (!all_of(Insts, SameAsI0)) {
- // SROA can't speculate lifetime markers of selects/phis, and the
- // backend may handle such lifetimes incorrectly as well (#104776).
- // Don't sink lifetimes if it would introduce a phi on the pointer
- // argument.
- if (isa<LifetimeIntrinsic>(I0) && OI == 1 &&
- any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
- }))
- return false;
-
if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
!canReplaceOperandWithVariable(I0, OI))
// We can't create a PHI from this GEP.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6e42063..99a96a8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1354,9 +1354,10 @@ public:
ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
ForceTailFoldingStyle.getValue()};
- if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
+ ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
return;
- // Override forced styles if needed.
+ // Override EVL styles if needed.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
@@ -1505,6 +1506,11 @@ private:
ElementCount UserVF,
bool FoldTailByMasking);
+ /// If \p VF > MaxTripCount, clamps it to the next lower VF that is <=
+ /// MaxTripCount.
+ ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
+ bool FoldTailByMasking) const;
+
 /// \return the maximized element count based on the target's vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
@@ -3854,6 +3860,38 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
Legal->hasVectorCallVariants())));
}
+ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
+ ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
+ unsigned EstimatedVF = VF.getKnownMinValue();
+ if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+ auto Min = Attr.getVScaleRangeMin();
+ EstimatedVF *= Min;
+ }
+
+ // When a scalar epilogue is required, at least one iteration of the scalar
+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
+ // max VF that results in a dead vector loop.
+ if (MaxTripCount > 0 && requiresScalarEpilogue(true))
+ MaxTripCount -= 1;
+
+ if (MaxTripCount && MaxTripCount <= EstimatedVF &&
+ (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
+ // If the upper bound of the loop trip count (TC) is known at compile time,
+ // there is no point in choosing a VF greater than TC. Select the maximum
+ // power of two which doesn't exceed TC. If VF is scalable, we only fall
+ // back on a fixed VF when the TC is less than or equal to the known number
+ // of lanes.
+ auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
+ "exceeding the constant trip count: "
+ << ClampedUpperTripCount << "\n");
+ return ElementCount::get(ClampedUpperTripCount,
+ FoldTailByMasking ? VF.isScalable() : false);
+ }
+ return VF;
+}
+
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) {
@@ -3885,40 +3923,16 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(1);
}
- unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
- if (MaxVectorElementCount.isScalable() &&
- TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
- auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
- auto Min = Attr.getVScaleRangeMin();
- WidestRegisterMinEC *= Min;
- }
-
- // When a scalar epilogue is required, at least one iteration of the scalar
- // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
- // max VF that results in a dead vector loop.
- if (MaxTripCount > 0 && requiresScalarEpilogue(true))
- MaxTripCount -= 1;
-
- if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
- (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
- // If upper bound loop trip count (TC) is known at compile time there is no
- // point in choosing VF greater than TC (as done in the loop below). Select
- // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
- // scalable, we only fall back on a fixed VF when the TC is less than or
- // equal to the known number of lanes.
- auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
- "exceeding the constant trip count: "
- << ClampedUpperTripCount << "\n");
- return ElementCount::get(
- ClampedUpperTripCount,
- FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
- }
+ ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount,
+ MaxTripCount, FoldTailByMasking);
+ // If the MaxVF was already clamped, there's no point in trying to pick a
+ // larger one.
+ if (MaxVF != MaxVectorElementCount)
+ return MaxVF;
TargetTransformInfo::RegisterKind RegKind =
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector;
- ElementCount MaxVF = MaxVectorElementCount;
if (MaxVF.isScalable())
MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
@@ -3940,10 +3954,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
}
}
- // Invalidate any widening decisions we might have made, in case the loop
- // requires prediction (decided later), but we have already made some
- // load/store widening decisions.
- invalidateCostModelingDecisions();
+ MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking);
+
+ if (MaxVectorElementCount != MaxVF) {
+ // Invalidate any widening decisions we might have made, in case the loop
+ // requires prediction (decided later), but we have already made some
+ // load/store widening decisions.
+ invalidateCostModelingDecisions();
+ }
}
return MaxVF;
}
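
A small numeric illustration of the clamp factored into clampVFByMaxTripCount above; the values are made up and std::bit_floor stands in for llvm::bit_floor:

    #include <bit>
    #include <cassert>

    int main() {
      unsigned EstimatedVF = 8;
      unsigned MaxTripCount = 7; // known upper bound on the trip count
      // A vector body with VF 8 never executes for 7 iterations, so clamp to
      // the largest power of two not exceeding the trip count.
      if (MaxTripCount && MaxTripCount <= EstimatedVF)
        EstimatedVF = std::bit_floor(MaxTripCount); // bit_floor(7) == 4
      assert(EstimatedVF == 4);
      return 0;
    }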
@@ -4479,6 +4497,28 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
+ if (MainLoopVF.isFixed()) {
+ // TODO: extend to support scalable VFs.
+ const SCEV *TC = vputils::getSCEVExprForVPValue(
+ getPlanFor(MainLoopVF).getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(TC) &&
+ "Trip count SCEV must be computable");
+ RemainingIterations = SE.getURemExpr(
+ TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+
+ // No iterations left to process in the epilogue.
+ if (RemainingIterations->isZero())
+ return Result;
+
+ MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
+ if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
+ SE.getConstant(TCType, MaxTripCount))) {
+ MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
+ }
+ LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
+ << MaxTripCount << "\n");
+ }
+
for (auto &NextVF : ProfitableVFs) {
// Skip candidate VFs without a corresponding VPlan.
if (!hasPlanWithVF(NextVF.Width))
@@ -4496,24 +4536,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// If NextVF is greater than the number of remaining iterations, the
// epilogue loop would be dead. Skip such factors.
- if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
- // TODO: extend to support scalable VFs.
- if (!RemainingIterations) {
- const SCEV *TC = vputils::getSCEVExprForVPValue(
- getPlanFor(NextVF.Width).getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TC) &&
- "Trip count SCEV must be computable");
- RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
- MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
- if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
- SE.getConstant(TCType, MaxTripCount))) {
- MaxTripCount =
- SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
- }
- LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
- << MaxTripCount << "\n");
- }
+ if (RemainingIterations && !NextVF.Width.isScalable()) {
if (SE.isKnownPredicate(
CmpInst::ICMP_UGT,
SE.getConstant(TCType, NextVF.Width.getFixedValue()),
@@ -8793,8 +8816,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
- if (!VPlanTransforms::runPass(
- VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath, *Plan))
+ if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
+ *Plan))
return nullptr;
// Transform recipes to abstract recipes if it is legal and beneficial and
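
The arithmetic hoisted out of the loop in selectEpilogueVectorizationFactor above, as a standalone sketch with made-up numbers:

    #include <cassert>

    int main() {
      unsigned TC = 1000;              // trip count, known here for illustration
      unsigned MainLoopVF = 8, IC = 2; // main-loop VF and interleave count
      // Iterations left over for the epilogue after the main vector loop.
      unsigned RemainingIterations = TC % (MainLoopVF * IC); // 1000 % 16 == 8
      // Even with an unknown TC, the epilogue runs at most VF * IC - 1 times.
      unsigned MaxTripCount = MainLoopVF * IC - 1; // 15
      if (RemainingIterations < MaxTripCount)
        MaxTripCount = RemainingIterations; // tighten when provable
      assert(RemainingIterations == 8 && MaxTripCount == 8);
      return 0;
    }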
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 204268e..9b67b7d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1662,6 +1662,8 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
VPSlotTracker &SlotTracker) const override;
#endif
+ unsigned getOpcode() const { return Instruction::Select; }
+
VPValue *getCond() const {
return getOperand(0);
}
@@ -2335,8 +2337,9 @@ public:
return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
}
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
+ }
/// Return the cost of this VPWidenMemoryRecipe.
InstructionCost computeCost(ElementCount VF,
@@ -4188,13 +4191,11 @@ public:
return VPB;
}
- /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
- /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
- /// The returned block is owned by the VPlan and deleted once the VPlan is
- /// destroyed.
- VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
- bool IsReplicator = false) {
- auto *VPB = new VPRegionBlock(Name, IsReplicator);
+ /// Create a new loop VPRegionBlock with \p Name and entry and exiting
+ /// blocks set to nullptr. The returned block is owned by the VPlan and
+ /// deleted once the VPlan is destroyed.
+ VPRegionBlock *createVPRegionBlock(const std::string &Name = "") {
+ auto *VPB = new VPRegionBlock(Name);
CreatedBlocks.push_back(VPB);
return VPB;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index ba1f9aa..194874a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -411,7 +411,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
// LatchExitVPB, taking care to preserve the original predecessor & successor
// order of blocks. Set region entry and exiting after both HeaderVPB and
// LatchVPBB have been disconnected from their predecessors/successors.
- auto *R = Plan.createVPRegionBlock("", false /*isReplicator*/);
+ auto *R = Plan.createVPRegionBlock();
VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
VPBlockUtils::disconnectBlocks(LatchVPBB, R);
VPBlockUtils::connectBlocks(PreheaderVPBB, R);
@@ -653,7 +653,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
}
-bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) {
+bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
RedPhiR->getBackedgeValue()->getDefiningRecipe());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 57b713d..b2066ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -991,7 +991,13 @@ bool VPInstruction::isVectorToScalar() const {
}
bool VPInstruction::isSingleScalar() const {
- return getOpcode() == Instruction::PHI || isScalarCast();
+ switch (getOpcode()) {
+ case Instruction::PHI:
+ case VPInstruction::ExplicitVectorLength:
+ return true;
+ default:
+ return isScalarCast();
+ }
}
void VPInstruction::execute(VPTransformState &State) {
@@ -2411,42 +2417,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPBlendRecipe::execute(VPTransformState &State) {
- assert(isNormalized() && "Expected blend to be normalized!");
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // SELECT(Mask1, In1,
- // In0)))
- // Note that Mask0 is never used: lanes for which no path reaches this phi and
- // are essentially undef are taken from In0.
- bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- Value *Result = nullptr;
- for (unsigned In = 0; In < NumIncoming; ++In) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
- if (In == 0)
- Result = In0; // Initialize with the first incoming value.
- else {
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
- Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
- }
- }
- State.set(this, Result, OnlyFirstLaneUsed);
-}
-
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
// Handle cases where only the first lane is used the same way as the legacy
@@ -3445,7 +3415,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
VPValue *BlockInMask = getMask();
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
- Value *PoisonVec = PoisonValue::get(VecTy);
auto CreateGroupMask = [&BlockInMask, &State,
&InterleaveFactor](Value *MaskForGaps) -> Value * {
@@ -3484,6 +3453,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
Instruction *NewLoad;
if (BlockInMask || MaskForGaps) {
Value *GroupMask = CreateGroupMask(MaskForGaps);
+ Value *PoisonVec = PoisonValue::get(VecTy);
NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
Group->getAlign(), GroupMask,
PoisonVec, "wide.masked.vec");
@@ -3493,57 +3463,39 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
Group->addMetadata(NewLoad);
ArrayRef<VPValue *> VPDefs = definedValues();
- const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
- Value *Deinterleave = State.Builder.CreateIntrinsic(
+ NewLoad = State.Builder.CreateIntrinsic(
getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
+ }
- for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
- Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
- if (!Member) {
- // This value is not needed as it's not used
- cast<Instruction>(StridedVec)->eraseFromParent();
- continue;
- }
- // If this member has different type, cast the result type.
- if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
- StridedVec =
- createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
-
- State.set(VPDefs[J], StridedVec);
- ++J;
- }
+ auto CreateStridedVector = [&InterleaveFactor, &State,
+ &NewLoad](unsigned Index) -> Value * {
+ assert(Index < InterleaveFactor && "Illegal group index");
+ if (State.VF.isScalable())
+ return State.Builder.CreateExtractValue(NewLoad, Index);
- return;
- }
- assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+ // For fixed length VF, use shuffle to extract the sub-vectors from the
+ // wide load.
+ auto StrideMask =
+ createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
+ return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
+ "strided.vec");
+ };
- // For each member in the group, shuffle out the appropriate data from the
- // wide loads.
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
// Skip the gaps in the group.
if (!Member)
continue;
- auto StrideMask =
- createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
- Value *StridedVec =
- State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
+ Value *StridedVec = CreateStridedVector(I);
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
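
A hypothetical fixed-VF model of the CreateStridedVector lambda above: with interleave factor 2 and VF 4, member I of the group is pulled from the 8-element wide load with the stride mask {I, I+2, I+4, I+6}:

    #include <array>
    #include <cassert>

    std::array<int, 4> stridedVec(const std::array<int, 8> &WideLoad,
                                  unsigned Index, unsigned Factor) {
      std::array<int, 4> Out{};
      for (unsigned Lane = 0; Lane < 4; ++Lane)
        Out[Lane] = WideLoad[Index + Lane * Factor]; // createStrideMask pattern
      return Out;
    }

    int main() {
      // Two members interleaved element by element in memory.
      std::array<int, 8> Wide = {0, 10, 1, 11, 2, 12, 3, 13};
      assert((stridedVec(Wide, 0, 2) == std::array<int, 4>{0, 1, 2, 3}));
      assert((stridedVec(Wide, 1, 2) == std::array<int, 4>{10, 11, 12, 13}));
      return 0;
    }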
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2a92083..3372bcc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -997,7 +997,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
// InstSimplifyFolder.
if (TypeSwitch<VPRecipeBase *, bool>(&R)
.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
- VPReplicateRecipe>([&](auto *I) {
+ VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) {
const DataLayout &DL =
Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
@@ -1481,9 +1481,9 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
// (BranchOnCond true).
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
- if (all_of(
- Header->phis(),
- IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
+ if (all_of(Header->phis(),
+ IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+ VPFirstOrderRecurrencePHIRecipe>)) {
for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
@@ -2711,6 +2711,18 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
continue;
}
+ // Expand VPBlendRecipe into VPInstruction::Select.
+ VPBuilder Builder(&R);
+ if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+ VPValue *Select = Blend->getIncomingValue(0);
+ for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+ Select = Builder.createSelect(Blend->getMask(I),
+ Blend->getIncomingValue(I), Select,
+ R.getDebugLoc(), "predphi");
+ Blend->replaceAllUsesWith(Select);
+ ToRemove.push_back(Blend);
+ }
+
if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
Expr->decompose();
ToRemove.push_back(Expr);
@@ -2724,7 +2736,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
// Expand WideIVStep.
auto *VPI = cast<VPInstruction>(&R);
- VPBuilder Builder(VPI);
Type *IVTy = TypeInfo.inferScalarType(VPI);
if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
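
A scalar sketch of the VPBlendRecipe expansion added in convertToConcreteRecipes above: a normalized blend with incoming values In0..In2 and masks M1/M2 lowers to a right-leaning select chain, with In0 (which carries no mask) as the fallback. Hypothetical code, not the VPlan API:

    #include <cassert>

    int blend3(int In0, int In1, bool M1, int In2, bool M2) {
      int Select = In0;           // getIncomingValue(0), no mask
      Select = M1 ? In1 : Select; // select(getMask(1), getIncomingValue(1), ..)
      Select = M2 ? In2 : Select; // select(getMask(2), getIncomingValue(2), ..)
      return Select;
    }

    int main() {
      assert(blend3(10, 20, /*M1=*/true, 30, /*M2=*/false) == 20);
      assert(blend3(10, 20, /*M1=*/false, 30, /*M2=*/true) == 30);
      assert(blend3(10, 20, /*M1=*/false, 30, /*M2=*/false) == 10);
      return 0;
    }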
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 04cb7a7..ab189f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -107,7 +107,7 @@ struct VPlanTransforms {
/// try to update the vector loop to exit early if any input is NaN and resume
/// executing in the scalar loop to handle the NaNs there. Return false if
/// this attempt was unsuccessful.
- static bool handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan);
+ static bool handleMaxMinNumReductions(VPlan &Plan);
/// Clear NSW/NUW flags from reduction instructions if necessary.
static void clearReductionWrapFlags(VPlan &Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fe8d74c..82adc34 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,7 +115,7 @@ private:
bool foldInsExtFNeg(Instruction &I);
bool foldInsExtBinop(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
- bool foldBitOpOfBitcasts(Instruction &I);
+ bool foldBitOpOfCastops(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
bool scalarizeOpOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
@@ -808,48 +808,87 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
return true;
}
-bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
- // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
- Value *LHSSrc, *RHSSrc;
- if (!match(&I, m_BitwiseLogic(m_BitCast(m_Value(LHSSrc)),
- m_BitCast(m_Value(RHSSrc)))))
+/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
+/// Supports: bitcast, trunc, sext, zext
+bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
+ // Check if this is a bitwise logic operation
+ auto *BinOp = dyn_cast<BinaryOperator>(&I);
+ if (!BinOp || !BinOp->isBitwiseLogicOp())
return false;
+ // Get the cast instructions
+ auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
+ auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
+ if (!LHSCast || !RHSCast) {
+ LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
+ return false;
+ }
+
+ // Both casts must be the same type
+ Instruction::CastOps CastOpcode = LHSCast->getOpcode();
+ if (CastOpcode != RHSCast->getOpcode())
+ return false;
+
+ // Only handle supported cast operations
+ switch (CastOpcode) {
+ case Instruction::BitCast:
+ case Instruction::Trunc:
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ break;
+ default:
+ return false;
+ }
+
+ Value *LHSSrc = LHSCast->getOperand(0);
+ Value *RHSSrc = RHSCast->getOperand(0);
+
// Source types must match
if (LHSSrc->getType() != RHSSrc->getType())
return false;
- if (!LHSSrc->getType()->getScalarType()->isIntegerTy())
- return false;
- // Only handle vector types
+ // Only handle vector types with integer elements
auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
if (!SrcVecTy || !DstVecTy)
return false;
- // Same total bit width
- assert(SrcVecTy->getPrimitiveSizeInBits() ==
- DstVecTy->getPrimitiveSizeInBits() &&
- "Bitcast should preserve total bit width");
+ if (!SrcVecTy->getScalarType()->isIntegerTy() ||
+ !DstVecTy->getScalarType()->isIntegerTy())
+ return false;
// Cost Check :
- // OldCost = bitlogic + 2*bitcasts
- // NewCost = bitlogic + bitcast
- auto *BinOp = cast<BinaryOperator>(&I);
+ // OldCost = bitlogic + 2*casts
+ // NewCost = bitlogic + cast
+
+ // Calculate specific costs for each cast with instruction context
+ InstructionCost LHSCastCost =
+ TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
+ TTI::CastContextHint::None, CostKind, LHSCast);
+ InstructionCost RHSCastCost =
+ TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
+ TTI::CastContextHint::None, CostKind, RHSCast);
+
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy) +
- TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, LHSSrc->getType(),
- TTI::CastContextHint::None) +
- TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, RHSSrc->getType(),
- TTI::CastContextHint::None);
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
+ LHSCastCost + RHSCastCost;
+
+ // For new cost, we can't provide an instruction (it doesn't exist yet)
+ InstructionCost GenericCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
+
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy) +
- TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, SrcVecTy,
- TTI::CastContextHint::None);
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
+ GenericCastCost;
- LLVM_DEBUG(dbgs() << "Found a bitwise logic op of bitcasted values: " << I
- << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
- << "\n");
+ // Account for multi-use casts using specific costs
+ if (!LHSCast->hasOneUse())
+ NewCost += LHSCastCost;
+ if (!RHSCast->hasOneUse())
+ NewCost += RHSCastCost;
+
+ LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
+ << " NewCost=" << NewCost << "\n");
if (NewCost > OldCost)
return false;
@@ -862,8 +901,16 @@ bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
Worklist.pushValue(NewOp);
- // Bitcast the result back
- Value *Result = Builder.CreateBitCast(NewOp, I.getType());
+ // Create the cast operation directly to ensure we get a new instruction
+ Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+
+ // Preserve cast instruction flags
+ NewCast->copyIRFlags(LHSCast);
+ NewCast->andIRFlags(RHSCast);
+
+ // Insert the new instruction
+ Value *Result = Builder.Insert(NewCast);
+
replaceValue(I, *Result);
return true;
}
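
The algebra behind foldBitOpOfCastops above, checked exhaustively for the zext case: the bitwise ops commute with zero-extension, so the binop can run in the narrow source type followed by a single cast. The cost check then compares one binop plus two casts (old) against one binop plus one cast (new), charging multi-use casts back to the new form. A hypothetical standalone check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 256; ++Y) {
          uint8_t NX = X, NY = Y;
          // zext(x) op zext(y) == zext(x op y) for and/or/xor.
          assert((uint32_t(NX) & uint32_t(NY)) == uint32_t(uint8_t(NX & NY)));
          assert((uint32_t(NX) | uint32_t(NY)) == uint32_t(uint8_t(NX | NY)));
          assert((uint32_t(NX) ^ uint32_t(NY)) == uint32_t(uint8_t(NX ^ NY)));
        }
      return 0;
    }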
@@ -3773,7 +3820,7 @@ bool VectorCombine::run() {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- MadeChange |= foldBitOpOfBitcasts(I);
+ MadeChange |= foldBitOpOfCastops(I);
break;
default:
MadeChange |= shrinkType(I);
diff --git a/llvm/test/Analysis/BasicAA/modref.ll b/llvm/test/Analysis/BasicAA/modref.ll
index 0619f8e..1aab28f3 100644
--- a/llvm/test/Analysis/BasicAA/modref.ll
+++ b/llvm/test/Analysis/BasicAA/modref.ll
@@ -67,27 +67,33 @@ define i8 @test2a(ptr %P) {
ret i8 %A
}
-define void @test3(ptr %P, i8 %X) {
+define void @test3(i8 %X) {
; CHECK-LABEL: @test3(
-; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 2
+; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8
+; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[P]], i32 2
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[P]])
; CHECK-NEXT: store i8 2, ptr [[P2]], align 1
+; CHECK-NEXT: call void @external(ptr [[P]])
; CHECK-NEXT: ret void
;
+ %P = alloca i64
%Y = add i8 %X, 1 ;; Dead, because the only use (the store) is dead.
%P2 = getelementptr i8, ptr %P, i32 2
store i8 %Y, ptr %P2 ;; Not read by lifetime.end, should be removed.
call void @llvm.lifetime.end.p0(i64 1, ptr %P)
store i8 2, ptr %P2
+ call void @external(ptr %P)
ret void
}
-define void @test3a(ptr %P, i8 %X) {
+define void @test3a(i8 %X) {
; CHECK-LABEL: @test3a(
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 10, ptr [[P:%.*]])
+; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 10, ptr [[P]])
; CHECK-NEXT: ret void
;
+ %P = alloca i64
%Y = add i8 %X, 1 ;; Dead, because the only use (the store) is dead.
%P2 = getelementptr i8, ptr %P, i32 2
diff --git a/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll b/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll
index 658d738..1c9d201 100644
--- a/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll
+++ b/llvm/test/Analysis/CallGraph/ignore-assumelike-calls.ll
@@ -10,7 +10,7 @@
; CHECK-EMPTY:
; CHECK-NEXT: Call graph node for function: 'bitcast_only'<<{{.*}}>> #uses=0
; CHECK-EMPTY:
-; CHECK-NEXT: Call graph node for function: 'llvm.lifetime.start.p0'<<{{.*}}>> #uses=3
+; CHECK-NEXT: Call graph node for function: 'llvm.lifetime.start.p0'<<{{.*}}>> #uses=2
; CHECK-EMPTY:
; CHECK-NEXT: Call graph node for function: 'llvm.memset.p0.i64'<<{{.*}}>> #uses=2
; CHECK-EMPTY:
@@ -25,18 +25,11 @@
; CHECK-NEXT: Call graph node for function: 'used_by_lifetime'<<{{.*}}>> #uses=0
; CHECK-NEXT: CS<{{.*}}> calls function 'llvm.lifetime.start.p0'
; CHECK-EMPTY:
-; CHECK-NEXT: Call graph node for function: 'used_by_lifetime_cast'<<{{.*}}>> #uses=0
-; CHECK-NEXT: CS<{{.*}}> calls function 'llvm.lifetime.start.p0'
-; CHECK-EMPTY:
define internal void @used_by_lifetime() {
entry:
- call void @llvm.lifetime.start.p0(i64 4, ptr @used_by_lifetime)
- ret void
-}
-
-define internal void @used_by_lifetime_cast() addrspace(1) {
- call void @llvm.lifetime.start.p0(i64 4, ptr addrspacecast (ptr addrspace(1) @used_by_lifetime_cast to ptr))
+ %a = alloca i8
+ call void @llvm.lifetime.start.p0(i64 4, ptr %a)
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
index 117315c..805b3713 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
@@ -31,3 +31,24 @@ define void @sve_fpext() {
ret void
}
+
+define void @sve_fpext_bf16() {
+; CHECK-LABEL: 'sve_fpext_bf16'
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
+ %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x float>
+ %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x float>
+
+ %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x double>
+ %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x double>
+ %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x double>
+
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
index a17c6ce..bb31ebf 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOBF16
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve,+bf16 -S -o - < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
@@ -31,3 +32,27 @@ define void @sve_fptruncs() {
ret void
}
+
+define void @sve_fptruncs_bf16() {
+; CHECK-LABEL: 'sve_fptruncs_bf16'
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
+ %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
+ %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
+
+ %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
+ %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
+ %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-BF16: {{.*}}
+; CHECK-NOBF16: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index ee485e2..7e8d957 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -449,33 +449,33 @@ define void @vector_reverse() #0 {
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_reverse'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
diff --git a/llvm/test/Analysis/CostModel/ARM/arith.ll b/llvm/test/Analysis/CostModel/ARM/arith.ll
index 8f17359..3e9b61b 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith.ll
@@ -1,74 +1,61 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve,+mve1beat < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE1
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE2
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve,+mve4beat < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE4
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve,+mve1beat < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE1
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE2
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve,+mve4beat < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE4
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @i1() {
; CHECK-LABEL: 'i1'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = add i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = sub i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = mul i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = ashr i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = lshr i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = shl i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %i = and i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %j = or i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %k = xor i1 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'i1'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %c = add i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %d = sub i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %e = mul i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %f = ashr i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %g = lshr i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %h = shl i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %i = and i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %j = or i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %k = xor i1 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'i1'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %c = add i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %d = sub i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %e = mul i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %f = ashr i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %g = lshr i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %h = shl i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %i = and i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %j = or i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %k = xor i1 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'i1'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i1 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'i1'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i = and i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %j = or i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c = add i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d = sub i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e = mul i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %f = ashr i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %g = lshr i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %h = shl i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i = and i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j = or i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k = xor i1 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = add i1 undef, undef
%d = sub i1 undef, undef
@@ -84,64 +71,52 @@ define void @i1() {
define void @i8() {
; CHECK-LABEL: 'i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = add i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = sub i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = mul i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = ashr i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = lshr i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = shl i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %i = and i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %j = or i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %k = xor i8 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'i8'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %c = add i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %d = sub i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %e = mul i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %f = ashr i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %g = lshr i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %h = shl i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %i = and i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %j = or i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %k = xor i8 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'i8'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %c = add i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %d = sub i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %e = mul i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %f = ashr i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %g = lshr i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %h = shl i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %i = and i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %j = or i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %k = xor i8 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'i8'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'i8'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i8 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c = add i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d = sub i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e = mul i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %f = ashr i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %g = lshr i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %h = shl i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i = and i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j = or i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k = xor i8 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = add i8 undef, undef
%d = sub i8 undef, undef
@@ -157,64 +132,52 @@ define void @i8() {
define void @i16() {
; CHECK-LABEL: 'i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = add i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = sub i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = mul i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = ashr i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = lshr i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = shl i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %i = and i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %j = or i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %k = xor i16 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'i16'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %c = add i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %d = sub i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %e = mul i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %f = ashr i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %g = lshr i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %h = shl i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %i = and i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %j = or i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %k = xor i16 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'i16'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %c = add i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %d = sub i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %e = mul i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %f = ashr i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %g = lshr i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %h = shl i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %i = and i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %j = or i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %k = xor i16 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'i16'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'i16'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i16 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c = add i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d = sub i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e = mul i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %f = ashr i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %g = lshr i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %h = shl i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i = and i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j = or i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k = xor i16 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = add i16 undef, undef
%d = sub i16 undef, undef
@@ -230,64 +193,52 @@ define void @i16() {
define void @i32() {
; CHECK-LABEL: 'i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = add i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = sub i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = mul i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = ashr i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = lshr i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = shl i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %i = and i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %j = or i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %k = xor i32 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'i32'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %c = add i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %d = sub i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %e = mul i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %f = ashr i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %g = lshr i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %h = shl i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %i = and i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %j = or i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: %k = xor i32 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'i32'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %c = add i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %d = sub i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %e = mul i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %f = ashr i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %g = lshr i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %h = shl i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %i = and i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %j = or i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: %k = xor i32 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'i32'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'i32'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = add i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = sub i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = mul i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = ashr i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = lshr i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = shl i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i = and i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j = or i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k = xor i32 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c = add i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d = sub i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e = mul i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %f = ashr i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %g = lshr i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %h = shl i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i = and i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j = or i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k = xor i32 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = add i32 undef, undef
%d = sub i32 undef, undef
@@ -303,64 +254,52 @@ define void @i32() {
define void @i64() {
; CHECK-LABEL: 'i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c = add i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d = sub i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = mul i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f = ashr i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g = lshr i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h = shl i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i = and i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j = or i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i64 undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %c = add i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %d = sub i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %e = mul i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %f = ashr i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %g = lshr i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %h = shl i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %i = and i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %j = or i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %k = xor i64 undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'i64'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c = add i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d = sub i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = mul i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f = ashr i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g = lshr i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h = shl i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i = and i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j = or i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i64 undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %c = add i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %d = sub i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %e = mul i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %f = ashr i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %g = lshr i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %h = shl i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %i = and i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %j = or i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %k = xor i64 undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'i64'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c = add i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d = sub i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = mul i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f = ashr i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g = lshr i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h = shl i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i = and i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j = or i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i64 undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %c = add i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %d = sub i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %e = mul i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %f = ashr i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %g = lshr i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %h = shl i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %i = and i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %j = or i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %k = xor i64 undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'i64'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c = add i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d = sub i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = mul i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f = ashr i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g = lshr i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h = shl i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i = and i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j = or i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i64 undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'i64'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c = add i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d = sub i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e = mul i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f = ashr i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g = lshr i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h = shl i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i = and i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j = or i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k = xor i64 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c = add i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %d = sub i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %e = mul i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f = ashr i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g = lshr i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h = shl i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %i = and i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %j = or i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %k = xor i64 undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = add i64 undef, undef
%d = sub i64 undef, undef
@@ -377,277 +316,238 @@ define void @i64() {
define void @vi8() {
; CHECK-MVE1-LABEL: 'vi8'
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i2 = and <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j2 = or <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k2 = xor <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %c4 = add <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %d4 = sub <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %e4 = mul <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %f4 = ashr <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %g4 = lshr <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %h4 = shl <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i4 = and <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j4 = or <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k4 = xor <4 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %c8 = add <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %d8 = sub <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %e8 = mul <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %f8 = ashr <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %g8 = lshr <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %h8 = shl <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i8 = and <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j8 = or <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k8 = xor <8 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %c16 = add <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %d16 = sub <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %e16 = mul <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %f16 = ashr <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %g16 = lshr <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %h16 = shl <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i16 = and <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j16 = or <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k16 = xor <16 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE2-LABEL: 'vi8'
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i2 = and <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j2 = or <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k2 = xor <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = add <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = sub <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = mul <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %f4 = ashr <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %g4 = lshr <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %h4 = shl <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i4 = and <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j4 = or <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k4 = xor <4 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c8 = add <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d8 = sub <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e8 = mul <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %f8 = ashr <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %g8 = lshr <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %h8 = shl <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i8 = and <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j8 = or <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k8 = xor <8 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c16 = add <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d16 = sub <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e16 = mul <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %f16 = ashr <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %g16 = lshr <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %h16 = shl <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i16 = and <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j16 = or <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k16 = xor <16 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE4-LABEL: 'vi8'
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %c4 = add <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %d4 = sub <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %e4 = mul <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %f4 = ashr <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %g4 = lshr <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %h4 = shl <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i4 = and <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j4 = or <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k4 = xor <4 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %c8 = add <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %d8 = sub <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %e8 = mul <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %f8 = ashr <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %g8 = lshr <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %h8 = shl <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i8 = and <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j8 = or <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k8 = xor <8 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %c16 = add <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %d16 = sub <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %e16 = mul <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %f16 = ashr <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %g16 = lshr <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %h16 = shl <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i16 = and <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j16 = or <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k16 = xor <16 x i8> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'vi8'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %c2 = add <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %d2 = sub <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %e2 = mul <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %f2 = ashr <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %g2 = lshr <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %h2 = shl <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %i2 = and <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %j2 = or <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %k2 = xor <2 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %c4 = add <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %d4 = sub <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %e4 = mul <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %f4 = ashr <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %g4 = lshr <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %h4 = shl <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %i4 = and <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %j4 = or <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %k4 = xor <4 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %c8 = add <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %d8 = sub <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %e8 = mul <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %f8 = ashr <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %g8 = lshr <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %h8 = shl <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %i8 = and <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %j8 = or <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %k8 = xor <8 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %c16 = add <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %d16 = sub <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %e16 = mul <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %f16 = ashr <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %g16 = lshr <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %h16 = shl <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %i16 = and <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %j16 = or <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %k16 = xor <16 x i8> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'vi8'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %c2 = add <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %d2 = sub <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %e2 = mul <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %f2 = ashr <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %g2 = lshr <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %h2 = shl <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %i2 = and <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %j2 = or <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %k2 = xor <2 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %c4 = add <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %d4 = sub <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %e4 = mul <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %f4 = ashr <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %g4 = lshr <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %h4 = shl <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %i4 = and <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %j4 = or <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %k4 = xor <4 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %c8 = add <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %d8 = sub <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %e8 = mul <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %f8 = ashr <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %g8 = lshr <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %h8 = shl <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %i8 = and <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %j8 = or <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %k8 = xor <8 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %c16 = add <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %d16 = sub <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %e16 = mul <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %f16 = ashr <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %g16 = lshr <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %h16 = shl <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %i16 = and <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %j16 = or <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %k16 = xor <16 x i8> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'vi8'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vi8'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c16 = add <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d16 = sub <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e16 = mul <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = ashr <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g16 = lshr <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h16 = shl <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i16 = and <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j16 = or <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k16 = xor <16 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c2 = add <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d2 = sub <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e2 = mul <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f2 = ashr <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g2 = lshr <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h2 = shl <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c4 = add <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d4 = sub <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e4 = mul <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f4 = ashr <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g4 = lshr <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h4 = shl <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i4 = and <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j4 = or <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k4 = xor <4 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c8 = add <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d8 = sub <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e8 = mul <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f8 = ashr <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g8 = lshr <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h8 = shl <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i8 = and <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j8 = or <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k8 = xor <8 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c16 = add <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d16 = sub <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e16 = mul <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f16 = ashr <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g16 = lshr <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h16 = shl <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i16 = and <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j16 = or <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k16 = xor <16 x i8> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = add <2 x i8> undef, undef
%d2 = sub <2 x i8> undef, undef
@@ -690,277 +590,238 @@ define void @vi8() {
define void @vi16() {
; CHECK-MVE1-LABEL: 'vi16'
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i2 = and <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j2 = or <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k2 = xor <2 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %c4 = add <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %d4 = sub <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %e4 = mul <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %f4 = ashr <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %g4 = lshr <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %h4 = shl <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i4 = and <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j4 = or <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k4 = xor <4 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %c8 = add <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %d8 = sub <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %e8 = mul <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %f8 = ashr <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %g8 = lshr <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %h8 = shl <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i8 = and <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j8 = or <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k8 = xor <8 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %c16 = add <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %d16 = sub <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %e16 = mul <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %f16 = ashr <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %g16 = lshr <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %h16 = shl <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %i16 = and <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %j16 = or <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %k16 = xor <16 x i16> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE2-LABEL: 'vi16'
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i2 = and <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j2 = or <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k2 = xor <2 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = add <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = sub <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = mul <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %f4 = ashr <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %g4 = lshr <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %h4 = shl <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i4 = and <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j4 = or <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k4 = xor <4 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c8 = add <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d8 = sub <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e8 = mul <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %f8 = ashr <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %g8 = lshr <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %h8 = shl <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i8 = and <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j8 = or <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k8 = xor <8 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %c16 = add <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %d16 = sub <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %e16 = mul <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %f16 = ashr <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %g16 = lshr <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %h16 = shl <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %i16 = and <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %j16 = or <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %k16 = xor <16 x i16> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE4-LABEL: 'vi16'
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %c4 = add <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %d4 = sub <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %e4 = mul <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %f4 = ashr <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %g4 = lshr <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %h4 = shl <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i4 = and <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j4 = or <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k4 = xor <4 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %c8 = add <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %d8 = sub <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %e8 = mul <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %f8 = ashr <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %g8 = lshr <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %h8 = shl <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i8 = and <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j8 = or <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k8 = xor <8 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %c16 = add <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %d16 = sub <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %e16 = mul <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %f16 = ashr <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %g16 = lshr <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %h16 = shl <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %i16 = and <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %j16 = or <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %k16 = xor <16 x i16> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'vi16'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %c2 = add <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %d2 = sub <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %e2 = mul <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %f2 = ashr <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %g2 = lshr <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %h2 = shl <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %i2 = and <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %j2 = or <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %k2 = xor <2 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %c4 = add <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %d4 = sub <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %e4 = mul <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %f4 = ashr <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %g4 = lshr <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %h4 = shl <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %i4 = and <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %j4 = or <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %k4 = xor <4 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %c8 = add <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %d8 = sub <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %e8 = mul <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %f8 = ashr <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %g8 = lshr <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %h8 = shl <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %i8 = and <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %j8 = or <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %k8 = xor <8 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %c16 = add <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %d16 = sub <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %e16 = mul <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %f16 = ashr <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %g16 = lshr <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %h16 = shl <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %i16 = and <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %j16 = or <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %k16 = xor <16 x i16> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'vi16'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %c2 = add <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %d2 = sub <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %e2 = mul <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %f2 = ashr <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %g2 = lshr <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %h2 = shl <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %i2 = and <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %j2 = or <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %k2 = xor <2 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %c4 = add <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %d4 = sub <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %e4 = mul <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %f4 = ashr <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %g4 = lshr <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %h4 = shl <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %i4 = and <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %j4 = or <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %k4 = xor <4 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %c8 = add <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %d8 = sub <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %e8 = mul <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %f8 = ashr <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %g8 = lshr <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %h8 = shl <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %i8 = and <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %j8 = or <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %k8 = xor <8 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %c16 = add <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %d16 = sub <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %e16 = mul <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %f16 = ashr <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %g16 = lshr <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %h16 = shl <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %i16 = and <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %j16 = or <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %k16 = xor <16 x i16> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'vi16'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vi16'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = add <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d8 = sub <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e8 = mul <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f8 = ashr <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g8 = lshr <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h8 = shl <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = and <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j8 = or <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k8 = xor <8 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c16 = add <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d16 = sub <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e16 = mul <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = ashr <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g16 = lshr <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h16 = shl <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i16 = and <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j16 = or <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k16 = xor <16 x i16> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c2 = add <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d2 = sub <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e2 = mul <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f2 = ashr <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g2 = lshr <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h2 = shl <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c4 = add <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d4 = sub <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e4 = mul <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f4 = ashr <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g4 = lshr <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h4 = shl <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i4 = and <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j4 = or <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k4 = xor <4 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c8 = add <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d8 = sub <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %e8 = mul <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f8 = ashr <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g8 = lshr <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h8 = shl <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i8 = and <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j8 = or <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k8 = xor <8 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c16 = add <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %d16 = sub <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %e16 = mul <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %f16 = ashr <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %g16 = lshr <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %h16 = shl <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %i16 = and <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %j16 = or <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %k16 = xor <16 x i16> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = add <2 x i16> undef, undef
%d2 = sub <2 x i16> undef, undef
@@ -1003,277 +864,238 @@ define void @vi16() {
define void @vi32() {
; CHECK-MVE1-LABEL: 'vi32'
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i2 = and <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j2 = or <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k2 = xor <2 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %c4 = add <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %d4 = sub <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %e4 = mul <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %f4 = ashr <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %g4 = lshr <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %h4 = shl <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i4 = and <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j4 = or <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k4 = xor <4 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %c8 = add <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %d8 = sub <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %e8 = mul <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %f8 = ashr <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %g8 = lshr <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %h8 = shl <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %i8 = and <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %j8 = or <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %k8 = xor <8 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %c16 = add <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %d16 = sub <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %e16 = mul <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %f16 = ashr <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %g16 = lshr <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %h16 = shl <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %i16 = and <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %j16 = or <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %k16 = xor <16 x i32> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE2-LABEL: 'vi32'
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i2 = and <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j2 = or <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k2 = xor <2 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = add <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = sub <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = mul <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %f4 = ashr <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %g4 = lshr <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %h4 = shl <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i4 = and <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j4 = or <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k4 = xor <4 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %c8 = add <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %d8 = sub <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %e8 = mul <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %f8 = ashr <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %g8 = lshr <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %h8 = shl <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %i8 = and <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %j8 = or <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %k8 = xor <8 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %c16 = add <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %d16 = sub <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %e16 = mul <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %f16 = ashr <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %g16 = lshr <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %h16 = shl <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %i16 = and <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %j16 = or <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %k16 = xor <16 x i32> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE4-LABEL: 'vi32'
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %c2 = add <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %d2 = sub <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %e2 = mul <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %f2 = ashr <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %g2 = lshr <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 10 for: %h2 = shl <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %c4 = add <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %d4 = sub <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %e4 = mul <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %f4 = ashr <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %g4 = lshr <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %h4 = shl <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i4 = and <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j4 = or <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k4 = xor <4 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %c8 = add <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %d8 = sub <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %e8 = mul <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %f8 = ashr <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %g8 = lshr <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %h8 = shl <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %i8 = and <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %j8 = or <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %k8 = xor <8 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %c16 = add <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %d16 = sub <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %e16 = mul <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %f16 = ashr <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %g16 = lshr <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %h16 = shl <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %i16 = and <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %j16 = or <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %k16 = xor <16 x i32> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'vi32'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %c2 = add <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %d2 = sub <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %e2 = mul <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %f2 = ashr <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %g2 = lshr <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %h2 = shl <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %i2 = and <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %j2 = or <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 2 for: %k2 = xor <2 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %c4 = add <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %d4 = sub <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %e4 = mul <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %f4 = ashr <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %g4 = lshr <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %h4 = shl <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %i4 = and <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %j4 = or <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %k4 = xor <4 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %c8 = add <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %d8 = sub <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %e8 = mul <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %f8 = ashr <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %g8 = lshr <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %h8 = shl <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %i8 = and <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %j8 = or <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %k8 = xor <8 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %c16 = add <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %d16 = sub <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %e16 = mul <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %f16 = ashr <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %g16 = lshr <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %h16 = shl <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %i16 = and <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %j16 = or <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %k16 = xor <16 x i32> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'vi32'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %c2 = add <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %d2 = sub <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %e2 = mul <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %f2 = ashr <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %g2 = lshr <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %h2 = shl <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %i2 = and <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %j2 = or <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 2 for: %k2 = xor <2 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %c4 = add <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %d4 = sub <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %e4 = mul <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %f4 = ashr <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %g4 = lshr <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %h4 = shl <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %i4 = and <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %j4 = or <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %k4 = xor <4 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %c8 = add <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %d8 = sub <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %e8 = mul <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %f8 = ashr <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %g8 = lshr <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %h8 = shl <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %i8 = and <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %j8 = or <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %k8 = xor <8 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %c16 = add <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %d16 = sub <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %e16 = mul <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %f16 = ashr <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %g16 = lshr <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %h16 = shl <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %i16 = and <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %j16 = or <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %k16 = xor <16 x i32> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'vi32'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vi32'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c4 = add <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d4 = sub <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e4 = mul <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f4 = ashr <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g4 = lshr <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h4 = shl <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = and <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j4 = or <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k4 = xor <4 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = add <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = sub <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = mul <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f8 = ashr <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g8 = lshr <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h8 = shl <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i8 = and <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j8 = or <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k8 = xor <8 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c16 = add <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d16 = sub <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e16 = mul <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = ashr <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g16 = lshr <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h16 = shl <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i16 = and <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j16 = or <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k16 = xor <16 x i32> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c2 = add <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d2 = sub <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %e2 = mul <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f2 = ashr <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g2 = lshr <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h2 = shl <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c4 = add <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d4 = sub <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %e4 = mul <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f4 = ashr <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g4 = lshr <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h4 = shl <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i4 = and <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j4 = or <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k4 = xor <4 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c8 = add <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %d8 = sub <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %e8 = mul <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %f8 = ashr <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %g8 = lshr <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %h8 = shl <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %i8 = and <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %j8 = or <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %k8 = xor <8 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %c16 = add <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %d16 = sub <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %e16 = mul <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %f16 = ashr <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %g16 = lshr <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %h16 = shl <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %i16 = and <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %j16 = or <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %k16 = xor <16 x i32> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = add <2 x i32> undef, undef
%d2 = sub <2 x i32> undef, undef
@@ -1316,277 +1138,238 @@ define void @vi32() {
define void @vi64() {
; CHECK-MVE1-LABEL: 'vi64'
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 20 for: %c2 = add <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 20 for: %d2 = sub <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 20 for: %e2 = mul <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 20 for: %f2 = ashr <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 20 for: %g2 = lshr <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 20 for: %h2 = shl <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %i2 = and <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %j2 = or <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %k2 = xor <2 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 40 for: %c4 = add <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 40 for: %d4 = sub <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 40 for: %e4 = mul <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 40 for: %f4 = ashr <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 40 for: %g4 = lshr <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 40 for: %h4 = shl <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %i4 = and <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %j4 = or <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:8 SizeLat:8 for: %k4 = xor <4 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 80 for: %c8 = add <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 80 for: %d8 = sub <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 80 for: %e8 = mul <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 80 for: %f8 = ashr <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 80 for: %g8 = lshr <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 80 for: %h8 = shl <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %i8 = and <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %j8 = or <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:16 SizeLat:16 for: %k8 = xor <8 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 160 for: %c16 = add <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 160 for: %d16 = sub <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 160 for: %e16 = mul <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 160 for: %f16 = ashr <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 160 for: %g16 = lshr <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of 160 for: %h16 = shl <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:8 Lat:32 SizeLat:32 for: %i16 = and <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:8 Lat:32 SizeLat:32 for: %j16 = or <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:8 Lat:32 SizeLat:32 for: %k16 = xor <16 x i64> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE2-LABEL: 'vi64'
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 20 for: %c2 = add <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 20 for: %d2 = sub <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 20 for: %e2 = mul <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 20 for: %f2 = ashr <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 20 for: %g2 = lshr <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 20 for: %h2 = shl <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %i2 = and <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %j2 = or <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %k2 = xor <2 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 40 for: %c4 = add <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 40 for: %d4 = sub <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 40 for: %e4 = mul <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 40 for: %f4 = ashr <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 40 for: %g4 = lshr <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 40 for: %h4 = shl <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %i4 = and <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %j4 = or <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %k4 = xor <4 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 80 for: %c8 = add <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 80 for: %d8 = sub <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 80 for: %e8 = mul <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 80 for: %f8 = ashr <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 80 for: %g8 = lshr <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 80 for: %h8 = shl <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %i8 = and <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %j8 = or <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %k8 = xor <8 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 160 for: %c16 = add <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 160 for: %d16 = sub <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 160 for: %e16 = mul <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 160 for: %f16 = ashr <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 160 for: %g16 = lshr <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of 160 for: %h16 = shl <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %i16 = and <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %j16 = or <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %k16 = xor <16 x i64> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVE4-LABEL: 'vi64'
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 20 for: %c2 = add <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 20 for: %d2 = sub <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 20 for: %e2 = mul <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 20 for: %f2 = ashr <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 20 for: %g2 = lshr <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 20 for: %h2 = shl <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 40 for: %c4 = add <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 40 for: %d4 = sub <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 40 for: %e4 = mul <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 40 for: %f4 = ashr <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 40 for: %g4 = lshr <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 40 for: %h4 = shl <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %i4 = and <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %j4 = or <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 2 for: %k4 = xor <4 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 80 for: %c8 = add <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 80 for: %d8 = sub <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 80 for: %e8 = mul <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 80 for: %f8 = ashr <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 80 for: %g8 = lshr <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 80 for: %h8 = shl <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %i8 = and <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %j8 = or <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 4 for: %k8 = xor <8 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 160 for: %c16 = add <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 160 for: %d16 = sub <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 160 for: %e16 = mul <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 160 for: %f16 = ashr <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 160 for: %g16 = lshr <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 160 for: %h16 = shl <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 8 for: %i16 = and <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 8 for: %j16 = or <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of 8 for: %k16 = xor <16 x i64> undef, undef
+; CHECK-MVE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-V8M-MAIN-LABEL: 'vi64'
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-V8M-MAIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %c2 = add <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %d2 = sub <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %e2 = mul <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %f2 = ashr <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %g2 = lshr <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %h2 = shl <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %i2 = and <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %j2 = or <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 4 for: %k2 = xor <2 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %c4 = add <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %d4 = sub <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %e4 = mul <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %f4 = ashr <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %g4 = lshr <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %h4 = shl <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %i4 = and <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %j4 = or <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 8 for: %k4 = xor <4 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %c8 = add <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %d8 = sub <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %e8 = mul <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %f8 = ashr <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %g8 = lshr <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %h8 = shl <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %i8 = and <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %j8 = or <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %k8 = xor <8 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %c16 = add <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %d16 = sub <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %e16 = mul <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %f16 = ashr <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %g16 = lshr <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %h16 = shl <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %i16 = and <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %j16 = or <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %k16 = xor <16 x i64> undef, undef
+; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8M-BASE-LABEL: 'vi64'
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-V8M-BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %c2 = add <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %d2 = sub <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %e2 = mul <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %f2 = ashr <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %g2 = lshr <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %h2 = shl <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %i2 = and <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %j2 = or <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 4 for: %k2 = xor <2 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %c4 = add <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %d4 = sub <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %e4 = mul <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %f4 = ashr <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %g4 = lshr <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %h4 = shl <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %i4 = and <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %j4 = or <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 8 for: %k4 = xor <4 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %c8 = add <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %d8 = sub <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %e8 = mul <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %f8 = ashr <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %g8 = lshr <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %h8 = shl <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %i8 = and <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %j8 = or <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %k8 = xor <8 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %c16 = add <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %d16 = sub <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %e16 = mul <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %f16 = ashr <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %g16 = lshr <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %h16 = shl <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %i16 = and <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %j16 = or <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %k16 = xor <16 x i64> undef, undef
+; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V8R-LABEL: 'vi64'
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vi64'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = mul <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %c8 = add <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %d8 = sub <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e8 = mul <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %f8 = ashr <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %g8 = lshr <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %h8 = shl <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i64> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %c2 = add <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %d2 = sub <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %e2 = mul <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %f2 = ashr <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %g2 = lshr <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %h2 = shl <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %i2 = and <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %j2 = or <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %k2 = xor <2 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c4 = add <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %d4 = sub <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %e4 = mul <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %f4 = ashr <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %g4 = lshr <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %h4 = shl <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %i4 = and <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %j4 = or <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %k4 = xor <4 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %c8 = add <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %d8 = sub <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %e8 = mul <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %f8 = ashr <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %g8 = lshr <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %h8 = shl <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %i8 = and <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %j8 = or <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %k8 = xor <8 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %c16 = add <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %d16 = sub <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %e16 = mul <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %f16 = ashr <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %g16 = lshr <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %h16 = shl <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %i16 = and <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %j16 = or <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %k16 = xor <16 x i64> undef, undef
+; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = add <2 x i64> undef, undef
%d2 = sub <2 x i64> undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-sat.ll b/llvm/test/Analysis/CostModel/RISCV/cast-sat.ll
index 8b870d3..ee70811 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast-sat.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast-sat.ll
@@ -1,192 +1,194 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+f,+d,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64ZVE32F
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl128b,+f,+d,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64V
+; RUN: opt < %s -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+f,+d,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=RV64ZVE32F
+; RUN: opt < %s -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+f,+d,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=RV64ZVE32F
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl128b,+f,+d,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=RV64V
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zvl128b,+f,+d,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=RV64V
define void @fptoui_sat() {
; RV64ZVE32F-LABEL: 'fptoui_sat'
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f64(<vscale x 1 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.fptoui.sat.nxv1i64.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.fptoui.sat.nxv1i64.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f64(<vscale x 2 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2i64.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2i64.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f64(<vscale x 4 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4i64.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4i64.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f64(<vscale x 8 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.fptoui.sat.nxv8i64.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.fptoui.sat.nxv8i64.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f64(<vscale x 16 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f64(<vscale x 16 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f64(<vscale x 16 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.fptoui.sat.nxv16i64.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.fptoui.sat.nxv16i64.nxv16f64(<vscale x 16 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f64(<vscale x 16 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; RV64V-LABEL: 'fptoui_sat'
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.fptoui.sat.nxv1i64.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.fptoui.sat.nxv1i64.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2i64.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2i64.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4i64.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4i64.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.fptoui.sat.nxv8i64.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.fptoui.sat.nxv8i64.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.fptoui.sat.nxv16i64.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.fptoui.sat.nxv16i64.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptoui.sat.v1i16.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptoui.sat.v1i32.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptoui.sat.v1i1.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.fptoui.sat.nxv1i8.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.fptoui.sat.nxv1i16.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.fptoui.sat.nxv1i32.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.fptoui.sat.nxv1i64.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.fptoui.sat.nxv1i64.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.fptoui.sat.nxv1i1.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.fptoui.sat.nxv2i8.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.fptoui.sat.nxv2i16.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2i32.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2i64.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.fptoui.sat.nxv2i64.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.fptoui.sat.nxv2i1.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.fptoui.sat.nxv4i8.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4i16.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4i32.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4i64.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4i64.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.fptoui.sat.nxv4i1.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.fptoui.sat.nxv8i8.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8i16.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8i32.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.fptoui.sat.nxv8i64.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.fptoui.sat.nxv8i64.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.fptoui.sat.nxv8i1.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.fptoui.sat.nxv16i8.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.fptoui.sat.nxv16i16.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.fptoui.sat.nxv16i32.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.fptoui.sat.nxv16i64.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.fptoui.sat.nxv16i64.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.fptoui.sat.nxv16i1.nxv16f64(<vscale x 16 x double> poison)
; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%v1f32_v1i8 = call <1 x i8> @llvm.fptoui.sat.v1i8.v1f32(<1 x float> poison)
@@ -293,189 +295,189 @@ define void @fptoui_sat() {
define void @fptosi_sat() {
; RV64ZVE32F-LABEL: 'fptosi_sat'
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f32(<1 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f64(<1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f32(<1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f64(<1 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f64(<vscale x 1 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.fptosi.sat.nxv1i64.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.fptosi.sat.nxv1i64.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f64(<vscale x 1 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f64(<vscale x 2 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2i64.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2i64.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f64(<vscale x 2 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f64(<vscale x 4 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4i64.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4i64.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f64(<vscale x 4 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f64(<vscale x 8 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.fptosi.sat.nxv8i64.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.fptosi.sat.nxv8i64.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f64(<vscale x 8 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f64(<vscale x 16 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f64(<vscale x 16 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f64(<vscale x 16 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.fptosi.sat.nxv16i64.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.fptosi.sat.nxv16i64.nxv16f64(<vscale x 16 x double> poison)
-; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
+; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
; RV64ZVE32F-NEXT: Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f64(<vscale x 16 x double> poison)
; RV64ZVE32F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; RV64V-LABEL: 'fptosi_sat'
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f32(<1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f64(<1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.fptosi.sat.nxv1i64.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.fptosi.sat.nxv1i64.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f64(<vscale x 1 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2i64.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2i64.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f64(<vscale x 2 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4i64.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4i64.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f64(<vscale x 4 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.fptosi.sat.nxv8i64.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.fptosi.sat.nxv8i64.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f64(<vscale x 8 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.fptosi.sat.nxv16i64.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.fptosi.sat.nxv16i64.nxv16f64(<vscale x 16 x double> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
-; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v1f32_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v1f64_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v1f64_v1i16 = call <1 x i16> @llvm.fptosi.sat.v1i16.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i32 = call <1 x i32> @llvm.fptosi.sat.v1i32.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f32_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v1f64_v1i64 = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v1f32_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f32(<1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v1f64_v1i1 = call <1 x i1> @llvm.fptosi.sat.v1i1.v1f64(<1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64_v2i16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64_v2i32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f32_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64_v2i64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f32_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f64_v2i1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f64_v4i8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64_v4i16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f32_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f64_v4i32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f32_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f64_v4i64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f32_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f64_v4i1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8f64_v8i8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f32_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8f64_v8i16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f32_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8f64_v8i32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8f32_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v8f64_v8i64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8f32_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8f64_v8i1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.fptosi.sat.nxv1i8.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.fptosi.sat.nxv1i16.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.fptosi.sat.nxv1i32.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.fptosi.sat.nxv1i64.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.fptosi.sat.nxv1i64.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f32(<vscale x 1 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.fptosi.sat.nxv1i1.nxv1f64(<vscale x 1 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.fptosi.sat.nxv2i8.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.fptosi.sat.nxv2i16.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2i32.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2i64.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2i64.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f32(<vscale x 2 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.fptosi.sat.nxv2i1.nxv2f64(<vscale x 2 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.fptosi.sat.nxv4i8.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4i16.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4i64.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4i64.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f32(<vscale x 4 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.fptosi.sat.nxv4i1.nxv4f64(<vscale x 4 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.fptosi.sat.nxv8i8.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8i32.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.fptosi.sat.nxv8i64.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.fptosi.sat.nxv8i64.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f32(<vscale x 8 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.fptosi.sat.nxv8i1.nxv8f64(<vscale x 8 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.fptosi.sat.nxv16i8.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.fptosi.sat.nxv16i16.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.fptosi.sat.nxv16i32.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.fptosi.sat.nxv16i64.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.fptosi.sat.nxv16i64.nxv16f64(<vscale x 16 x double> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f32(<vscale x 16 x float> poison)
+; RV64V-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.fptosi.sat.nxv16i1.nxv16f64(<vscale x 16 x double> poison)
; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%v1f32_v1i8 = call <1 x i8> @llvm.fptosi.sat.v1i8.v1f32(<1 x float> poison)
@@ -579,5 +581,3 @@ define void @fptosi_sat() {
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index ece528d..e3305c0 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; Check getShuffleCost for scalable vector
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck %s --check-prefixes=CHECK,ARGBASED
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+m,+v -intrinsic-cost-strategy=type-based-intrinsic-cost < %s | FileCheck %s --check-prefixes=CHECK,TYPEBASED
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+m,+v -cost-kind=code-size < %s | FileCheck %s --check-prefix=SIZE
define void @vector_broadcast() {
@@ -51,12 +52,19 @@ define void @vector_broadcast() {
}
define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
-; CHECK-LABEL: 'vector_insert_extract'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ARGBASED-LABEL: 'vector_insert_extract'
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'vector_insert_extract'
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_insert_extract'
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
@@ -140,22 +148,39 @@ define void @vector_reverse() {
}
define void @vector_splice() {
-; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ARGBASED-LABEL: 'vector_splice'
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'vector_splice'
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_splice'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 4bb4818..71746ca 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1873,155 +1873,80 @@ define void @is.fpclass() {
}
define void @reverse() {
-; ARGBASED-LABEL: 'reverse'
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <2 x i1> @llvm.experimental.vp.reverse.v2i1(<2 x i1> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %2 = call <4 x i1> @llvm.experimental.vp.reverse.v4i1(<4 x i1> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %3 = call <8 x i1> @llvm.experimental.vp.reverse.v8i1(<8 x i1> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %4 = call <16 x i1> @llvm.experimental.vp.reverse.v16i1(<16 x i1> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.reverse.v2i8(<2 x i8> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.reverse.v4i8(<4 x i8> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.reverse.v8i8(<8 x i8> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.reverse.v16i8(<16 x i8> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.reverse.v2i16(<2 x i16> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.reverse.v4i16(<4 x i16> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.reverse.v8i16(<8 x i16> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.reverse.v16i16(<16 x i16> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.reverse.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.reverse.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.reverse.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.reverse.v16i32(<16 x i32> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.reverse.v2i64(<2 x i64> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.reverse.v4i64(<4 x i64> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.reverse.v8i64(<8 x i64> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.reverse.v16i64(<16 x i64> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.reverse.v2bf16(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.reverse.v4bf16(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.reverse.v8bf16(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.reverse.v16bf16(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %25 = call <2 x half> @llvm.experimental.vp.reverse.v2f16(<2 x half> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %26 = call <4 x half> @llvm.experimental.vp.reverse.v4f16(<4 x half> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %27 = call <8 x half> @llvm.experimental.vp.reverse.v8f16(<8 x half> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %28 = call <16 x half> @llvm.experimental.vp.reverse.v16f16(<16 x half> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %29 = call <2 x float> @llvm.experimental.vp.reverse.v2f32(<2 x float> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %30 = call <4 x float> @llvm.experimental.vp.reverse.v4f32(<4 x float> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %31 = call <8 x float> @llvm.experimental.vp.reverse.v8f32(<8 x float> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %32 = call <16 x float> @llvm.experimental.vp.reverse.v16f32(<16 x float> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %33 = call <2 x double> @llvm.experimental.vp.reverse.v2f64(<2 x double> poison, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %34 = call <4 x double> @llvm.experimental.vp.reverse.v4f64(<4 x double> poison, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %35 = call <8 x double> @llvm.experimental.vp.reverse.v8f64(<8 x double> poison, <8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %36 = call <16 x double> @llvm.experimental.vp.reverse.v16f64(<16 x double> poison, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %37 = call <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %38 = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %39 = call <vscale x 8 x i1> @llvm.experimental.vp.reverse.nxv8i1(<vscale x 8 x i1> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %40 = call <vscale x 16 x i1> @llvm.experimental.vp.reverse.nxv16i1(<vscale x 16 x i1> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = call <vscale x 2 x i8> @llvm.experimental.vp.reverse.nxv2i8(<vscale x 2 x i8> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = call <vscale x 4 x i8> @llvm.experimental.vp.reverse.nxv4i8(<vscale x 4 x i8> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = call <vscale x 8 x i8> @llvm.experimental.vp.reverse.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %44 = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %45 = call <vscale x 2 x i16> @llvm.experimental.vp.reverse.nxv2i16(<vscale x 2 x i16> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %46 = call <vscale x 4 x i16> @llvm.experimental.vp.reverse.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %47 = call <vscale x 8 x i16> @llvm.experimental.vp.reverse.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %48 = call <vscale x 16 x i16> @llvm.experimental.vp.reverse.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %49 = call <vscale x 2 x i32> @llvm.experimental.vp.reverse.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %50 = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %51 = call <vscale x 8 x i32> @llvm.experimental.vp.reverse.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %52 = call <vscale x 16 x i32> @llvm.experimental.vp.reverse.nxv16i32(<vscale x 16 x i32> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %53 = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %54 = call <vscale x 4 x i64> @llvm.experimental.vp.reverse.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %55 = call <vscale x 8 x i64> @llvm.experimental.vp.reverse.nxv8i64(<vscale x 8 x i64> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %56 = call <vscale x 16 x i64> @llvm.experimental.vp.reverse.nxv16i64(<vscale x 16 x i64> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %57 = call <vscale x 2 x bfloat> @llvm.experimental.vp.reverse.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %58 = call <vscale x 4 x bfloat> @llvm.experimental.vp.reverse.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %59 = call <vscale x 8 x bfloat> @llvm.experimental.vp.reverse.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %60 = call <vscale x 16 x bfloat> @llvm.experimental.vp.reverse.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %61 = call <vscale x 2 x half> @llvm.experimental.vp.reverse.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %62 = call <vscale x 4 x half> @llvm.experimental.vp.reverse.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %63 = call <vscale x 8 x half> @llvm.experimental.vp.reverse.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %64 = call <vscale x 16 x half> @llvm.experimental.vp.reverse.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %65 = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %66 = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %67 = call <vscale x 8 x float> @llvm.experimental.vp.reverse.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %68 = call <vscale x 16 x float> @llvm.experimental.vp.reverse.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %69 = call <vscale x 2 x double> @llvm.experimental.vp.reverse.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %70 = call <vscale x 4 x double> @llvm.experimental.vp.reverse.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %71 = call <vscale x 8 x double> @llvm.experimental.vp.reverse.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %72 = call <vscale x 16 x double> @llvm.experimental.vp.reverse.nxv16f64(<vscale x 16 x double> poison, <vscale x 16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'reverse'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %1 = call <2 x i1> @llvm.experimental.vp.reverse.v2i1(<2 x i1> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %2 = call <4 x i1> @llvm.experimental.vp.reverse.v4i1(<4 x i1> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %3 = call <8 x i1> @llvm.experimental.vp.reverse.v8i1(<8 x i1> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %4 = call <16 x i1> @llvm.experimental.vp.reverse.v16i1(<16 x i1> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.reverse.v2i8(<2 x i8> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.reverse.v4i8(<4 x i8> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.reverse.v8i8(<8 x i8> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.reverse.v16i8(<16 x i8> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.reverse.v2i16(<2 x i16> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.reverse.v4i16(<4 x i16> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.reverse.v8i16(<8 x i16> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.reverse.v16i16(<16 x i16> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.reverse.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.reverse.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.reverse.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.reverse.v16i32(<16 x i32> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.reverse.v2i64(<2 x i64> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.reverse.v4i64(<4 x i64> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.reverse.v8i64(<8 x i64> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.reverse.v16i64(<16 x i64> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.reverse.v2bf16(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.reverse.v4bf16(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.reverse.v8bf16(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.reverse.v16bf16(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %25 = call <2 x half> @llvm.experimental.vp.reverse.v2f16(<2 x half> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %26 = call <4 x half> @llvm.experimental.vp.reverse.v4f16(<4 x half> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %27 = call <8 x half> @llvm.experimental.vp.reverse.v8f16(<8 x half> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %28 = call <16 x half> @llvm.experimental.vp.reverse.v16f16(<16 x half> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %29 = call <2 x float> @llvm.experimental.vp.reverse.v2f32(<2 x float> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %30 = call <4 x float> @llvm.experimental.vp.reverse.v4f32(<4 x float> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %31 = call <8 x float> @llvm.experimental.vp.reverse.v8f32(<8 x float> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %32 = call <16 x float> @llvm.experimental.vp.reverse.v16f32(<16 x float> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %33 = call <2 x double> @llvm.experimental.vp.reverse.v2f64(<2 x double> poison, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %34 = call <4 x double> @llvm.experimental.vp.reverse.v4f64(<4 x double> poison, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %35 = call <8 x double> @llvm.experimental.vp.reverse.v8f64(<8 x double> poison, <8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %36 = call <16 x double> @llvm.experimental.vp.reverse.v16f64(<16 x double> poison, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %37 = call <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %38 = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %39 = call <vscale x 8 x i1> @llvm.experimental.vp.reverse.nxv8i1(<vscale x 8 x i1> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %40 = call <vscale x 16 x i1> @llvm.experimental.vp.reverse.nxv16i1(<vscale x 16 x i1> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %41 = call <vscale x 2 x i8> @llvm.experimental.vp.reverse.nxv2i8(<vscale x 2 x i8> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %42 = call <vscale x 4 x i8> @llvm.experimental.vp.reverse.nxv4i8(<vscale x 4 x i8> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %43 = call <vscale x 8 x i8> @llvm.experimental.vp.reverse.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %44 = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %45 = call <vscale x 2 x i16> @llvm.experimental.vp.reverse.nxv2i16(<vscale x 2 x i16> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %46 = call <vscale x 4 x i16> @llvm.experimental.vp.reverse.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %47 = call <vscale x 8 x i16> @llvm.experimental.vp.reverse.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %48 = call <vscale x 16 x i16> @llvm.experimental.vp.reverse.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %49 = call <vscale x 2 x i32> @llvm.experimental.vp.reverse.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %50 = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %51 = call <vscale x 8 x i32> @llvm.experimental.vp.reverse.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %52 = call <vscale x 16 x i32> @llvm.experimental.vp.reverse.nxv16i32(<vscale x 16 x i32> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %53 = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %54 = call <vscale x 4 x i64> @llvm.experimental.vp.reverse.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %55 = call <vscale x 8 x i64> @llvm.experimental.vp.reverse.nxv8i64(<vscale x 8 x i64> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %56 = call <vscale x 16 x i64> @llvm.experimental.vp.reverse.nxv16i64(<vscale x 16 x i64> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %57 = call <vscale x 2 x bfloat> @llvm.experimental.vp.reverse.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %58 = call <vscale x 4 x bfloat> @llvm.experimental.vp.reverse.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %59 = call <vscale x 8 x bfloat> @llvm.experimental.vp.reverse.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %60 = call <vscale x 16 x bfloat> @llvm.experimental.vp.reverse.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %61 = call <vscale x 2 x half> @llvm.experimental.vp.reverse.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %62 = call <vscale x 4 x half> @llvm.experimental.vp.reverse.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %63 = call <vscale x 8 x half> @llvm.experimental.vp.reverse.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %64 = call <vscale x 16 x half> @llvm.experimental.vp.reverse.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %65 = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %66 = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %67 = call <vscale x 8 x float> @llvm.experimental.vp.reverse.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %68 = call <vscale x 16 x float> @llvm.experimental.vp.reverse.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %69 = call <vscale x 2 x double> @llvm.experimental.vp.reverse.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %70 = call <vscale x 4 x double> @llvm.experimental.vp.reverse.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %71 = call <vscale x 8 x double> @llvm.experimental.vp.reverse.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %72 = call <vscale x 16 x double> @llvm.experimental.vp.reverse.nxv16f64(<vscale x 16 x double> poison, <vscale x 16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'reverse'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <2 x i1> @llvm.experimental.vp.reverse.v2i1(<2 x i1> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %2 = call <4 x i1> @llvm.experimental.vp.reverse.v4i1(<4 x i1> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %3 = call <8 x i1> @llvm.experimental.vp.reverse.v8i1(<8 x i1> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %4 = call <16 x i1> @llvm.experimental.vp.reverse.v16i1(<16 x i1> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.reverse.v2i8(<2 x i8> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.reverse.v4i8(<4 x i8> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.reverse.v8i8(<8 x i8> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.reverse.v16i8(<16 x i8> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.reverse.v2i16(<2 x i16> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.reverse.v4i16(<4 x i16> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.reverse.v8i16(<8 x i16> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.reverse.v16i16(<16 x i16> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.reverse.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.reverse.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.reverse.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.reverse.v16i32(<16 x i32> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.reverse.v2i64(<2 x i64> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.reverse.v4i64(<4 x i64> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.reverse.v8i64(<8 x i64> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.reverse.v16i64(<16 x i64> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.reverse.v2bf16(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.reverse.v4bf16(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.reverse.v8bf16(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.reverse.v16bf16(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %25 = call <2 x half> @llvm.experimental.vp.reverse.v2f16(<2 x half> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %26 = call <4 x half> @llvm.experimental.vp.reverse.v4f16(<4 x half> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %27 = call <8 x half> @llvm.experimental.vp.reverse.v8f16(<8 x half> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %28 = call <16 x half> @llvm.experimental.vp.reverse.v16f16(<16 x half> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %29 = call <2 x float> @llvm.experimental.vp.reverse.v2f32(<2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %30 = call <4 x float> @llvm.experimental.vp.reverse.v4f32(<4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %31 = call <8 x float> @llvm.experimental.vp.reverse.v8f32(<8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %32 = call <16 x float> @llvm.experimental.vp.reverse.v16f32(<16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %33 = call <2 x double> @llvm.experimental.vp.reverse.v2f64(<2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %34 = call <4 x double> @llvm.experimental.vp.reverse.v4f64(<4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %35 = call <8 x double> @llvm.experimental.vp.reverse.v8f64(<8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %36 = call <16 x double> @llvm.experimental.vp.reverse.v16f64(<16 x double> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %37 = call <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %38 = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %39 = call <vscale x 8 x i1> @llvm.experimental.vp.reverse.nxv8i1(<vscale x 8 x i1> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %40 = call <vscale x 16 x i1> @llvm.experimental.vp.reverse.nxv16i1(<vscale x 16 x i1> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = call <vscale x 2 x i8> @llvm.experimental.vp.reverse.nxv2i8(<vscale x 2 x i8> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = call <vscale x 4 x i8> @llvm.experimental.vp.reverse.nxv4i8(<vscale x 4 x i8> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = call <vscale x 8 x i8> @llvm.experimental.vp.reverse.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %44 = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %45 = call <vscale x 2 x i16> @llvm.experimental.vp.reverse.nxv2i16(<vscale x 2 x i16> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %46 = call <vscale x 4 x i16> @llvm.experimental.vp.reverse.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %47 = call <vscale x 8 x i16> @llvm.experimental.vp.reverse.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %48 = call <vscale x 16 x i16> @llvm.experimental.vp.reverse.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %49 = call <vscale x 2 x i32> @llvm.experimental.vp.reverse.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %50 = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %51 = call <vscale x 8 x i32> @llvm.experimental.vp.reverse.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %52 = call <vscale x 16 x i32> @llvm.experimental.vp.reverse.nxv16i32(<vscale x 16 x i32> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %53 = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %54 = call <vscale x 4 x i64> @llvm.experimental.vp.reverse.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %55 = call <vscale x 8 x i64> @llvm.experimental.vp.reverse.nxv8i64(<vscale x 8 x i64> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %56 = call <vscale x 16 x i64> @llvm.experimental.vp.reverse.nxv16i64(<vscale x 16 x i64> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %57 = call <vscale x 2 x bfloat> @llvm.experimental.vp.reverse.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %58 = call <vscale x 4 x bfloat> @llvm.experimental.vp.reverse.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %59 = call <vscale x 8 x bfloat> @llvm.experimental.vp.reverse.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %60 = call <vscale x 16 x bfloat> @llvm.experimental.vp.reverse.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %61 = call <vscale x 2 x half> @llvm.experimental.vp.reverse.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %62 = call <vscale x 4 x half> @llvm.experimental.vp.reverse.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %63 = call <vscale x 8 x half> @llvm.experimental.vp.reverse.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %64 = call <vscale x 16 x half> @llvm.experimental.vp.reverse.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %65 = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %66 = call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %67 = call <vscale x 8 x float> @llvm.experimental.vp.reverse.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %68 = call <vscale x 16 x float> @llvm.experimental.vp.reverse.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %69 = call <vscale x 2 x double> @llvm.experimental.vp.reverse.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %70 = call <vscale x 4 x double> @llvm.experimental.vp.reverse.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %71 = call <vscale x 8 x double> @llvm.experimental.vp.reverse.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %72 = call <vscale x 16 x double> @llvm.experimental.vp.reverse.nxv16f64(<vscale x 16 x double> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call <2 x i1> @llvm.experimental.vp.reverse.v2i1(<2 x i1> poison, <2 x i1> poison, i32 poison)
call <4 x i1> @llvm.experimental.vp.reverse.v4i1(<4 x i1> poison, <4 x i1> poison, i32 poison)
diff --git a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll
index a8c5c43..3a54428 100644
--- a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll
@@ -4,6 +4,7 @@
define i32 @trivially_free() {
; CHECK-SIZE-LABEL: 'trivially_free'
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3)
@@ -13,14 +14,15 @@ define i32 @trivially_free() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-THROUGHPUT-LABEL: 'trivially_free'
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3)
@@ -30,13 +32,14 @@ define i32 @trivially_free() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
+ %alloca = alloca i8
%a0 = call i32 @llvm.annotation.i32(i32 undef, ptr undef, ptr undef, i32 undef)
call void @llvm.assume(i1 undef)
call void @llvm.experimental.noalias.scope.decl(metadata !4)
@@ -46,8 +49,8 @@ define i32 @trivially_free() {
%a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
%a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
%a4 = call i1 @llvm.is.constant.i32(i32 undef)
- call void @llvm.lifetime.start.p0(i64 1, ptr undef)
- call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+ call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
%a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 1, i1 1, i1 1)
%a6 = call ptr @llvm.ptr.annotation.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
call void @llvm.var.annotation(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll
index 560af3d..96064dc 100644
--- a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll
+++ b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll
@@ -6,6 +6,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define i32 @trivially_free() {
; CHECK-SIZE-LABEL: 'trivially_free'
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 4
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3)
@@ -15,8 +16,8 @@ define i32 @trivially_free() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a7 = call i1 @llvm.allow.ubsan.check(i8 123)
@@ -25,6 +26,7 @@ define i32 @trivially_free() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-THROUGHPUT-LABEL: 'trivially_free'
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 4
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3)
@@ -34,8 +36,8 @@ define i32 @trivially_free() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a7 = call i1 @llvm.allow.ubsan.check(i8 123)
@@ -43,6 +45,7 @@ define i32 @trivially_free() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+ %alloca = alloca i8
%a0 = call i32 @llvm.annotation.i32(i32 undef, ptr undef, ptr undef, i32 undef)
call void @llvm.assume(i1 undef)
call void @llvm.experimental.noalias.scope.decl(metadata !4)
@@ -52,8 +55,8 @@ define i32 @trivially_free() {
%a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
%a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
%a4 = call i1 @llvm.is.constant.i32(i32 undef)
- call void @llvm.lifetime.start.p0(i64 1, ptr undef)
- call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+ call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
%a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 1, i1 1, i1 1)
%a6 = call ptr @llvm.ptr.annotation.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
%a7 = call i1 @llvm.allow.ubsan.check(i8 123)
diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll
index 53828f2..f989ebe 100644
--- a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll
+++ b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll
@@ -4,6 +4,7 @@
define i32 @trivially_free() {
; CHECK-SIZE-LABEL: 'trivially_free'
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3)
@@ -13,8 +14,8 @@ define i32 @trivially_free() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
@@ -23,6 +24,7 @@ define i32 @trivially_free() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-THROUGHPUT-LABEL: 'trivially_free'
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %alloca = alloca i8, align 1
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32.p0(i32 undef, ptr undef, ptr undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3)
@@ -32,8 +34,8 @@ define i32 @trivially_free() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 true, i1 true, i1 true)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a6 = call ptr @llvm.ptr.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation.p0.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
@@ -41,6 +43,7 @@ define i32 @trivially_free() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a8 = call i1 @llvm.allow.runtime.check(metadata !"test_check")
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+ %alloca = alloca i8
%a0 = call i32 @llvm.annotation.i32(i32 undef, ptr undef, ptr undef, i32 undef)
call void @llvm.assume(i1 undef)
call void @llvm.experimental.noalias.scope.decl(metadata !4)
@@ -50,8 +53,8 @@ define i32 @trivially_free() {
%a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef)
%a3 = call ptr @llvm.strip.invariant.group.p0(ptr undef)
%a4 = call i1 @llvm.is.constant.i32(i32 undef)
- call void @llvm.lifetime.start.p0(i64 1, ptr undef)
- call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+ call void @llvm.lifetime.start.p0(i64 1, ptr %alloca)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %alloca)
%a5 = call i64 @llvm.objectsize.i64.p0(ptr undef, i1 1, i1 1, i1 1)
%a6 = call ptr @llvm.ptr.annotation.p0(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
call void @llvm.var.annotation(ptr undef, ptr undef, ptr undef, i32 undef, ptr undef)
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
index 0d1b082..311de84 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
@@ -106,10 +106,43 @@ exit:
ret void
}
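+; The loads cover %A[0..255] while the stores cover %A[510..1020]; with a
+; backedge-taken count of 255 the access ranges never overlap, so the
+; dependence is known safe without runtime checks.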
+define void @backward_dep_known_safe_due_to_backedge_taken_count(ptr %A) {
+; CHECK-LABEL: 'backward_dep_known_safe_due_to_backedge_taken_count'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %A.510 = getelementptr inbounds i32, ptr %A, i64 510
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.mul.2 = shl nuw nsw i64 %iv, 1
+ %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+ %l = load i32, ptr %gep, align 4
+ %add = add nsw i32 %l, 5
+ %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2
+ store i32 %add, ptr %gep.mul.2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 256
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
define void @backward_dep_known_distance_less_than_btc(ptr %A) {
; CHECK-LABEL: 'backward_dep_known_distance_less_than_btc'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 8160 bits
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 4064 bits
; CHECK-NEXT: Dependences:
; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %l = load i32, ptr %gep, align 4 ->
@@ -130,10 +163,10 @@ entry:
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%iv.mul.2 = shl nuw nsw i64 %iv, 1
- %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+ %gep = getelementptr inbounds i32, ptr %A, i64 %iv.mul.2
%l = load i32, ptr %gep, align 4
%add = add nsw i32 %l, 5
- %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2
+ %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv
store i32 %add, ptr %gep.mul.2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 256
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll
index 1a6e258..468b225 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll
@@ -8,21 +8,10 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define void @test_distance_positive_independent_via_trip_count(ptr %A) {
; CHECK-LABEL: 'test_distance_positive_independent_via_trip_count'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Check 0:
-; CHECK-NEXT: Comparing group GRP0:
-; CHECK-NEXT: %gep.A.400 = getelementptr inbounds i32, ptr %A.400, i64 %iv
-; CHECK-NEXT: Against group GRP1:
-; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
-; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: (400 + %A)<nuw> High: (804 + %A))
-; CHECK-NEXT: Member: {(400 + %A)<nuw>,+,4}<nuw><%loop>
-; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (101 + %A))
-; CHECK-NEXT: Member: {%A,+,1}<nuw><%loop>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
diff --git a/llvm/test/Analysis/MemorySSA/lifetime-simple.ll b/llvm/test/Analysis/MemorySSA/lifetime-simple.ll
index d409c14..18d2459 100644
--- a/llvm/test/Analysis/MemorySSA/lifetime-simple.ll
+++ b/llvm/test/Analysis/MemorySSA/lifetime-simple.ll
@@ -2,8 +2,12 @@
; This test checks that lifetime markers are considered clobbers of %P and,
; due to the lack of noalias information, of %Q as well.
-define i8 @test(ptr %P, ptr %Q) {
+declare ptr @obscure(ptr) memory(none)
+
+define i8 @test() {
entry:
+ %P = alloca [32 x i8]
+ %Q = call ptr @obscure(ptr %P)
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr %P)
call void @llvm.lifetime.start.p0(i64 32, ptr %P)
diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll
index b52444f..af57b3c 100644
--- a/llvm/test/Analysis/MemorySSA/pr39197.ll
+++ b/llvm/test/Analysis/MemorySSA/pr39197.ll
@@ -8,6 +8,8 @@ target triple = "s390x-ibm-linux"
@1 = internal global i64 9, align 8
@g_1042 = external dso_local global [5 x i16], align 2
+declare void @dummy()
+
; CHECK-LABEL: @main()
; Function Attrs: nounwind
define dso_local void @main() #0 {
@@ -15,9 +17,6 @@ define dso_local void @main() #0 {
unreachable
}
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-
; Function Attrs: nounwind
define dso_local void @func_1() #0 {
%1 = alloca ptr, align 8
@@ -31,7 +30,7 @@ define dso_local void @func_1() #0 {
%7 = load i64, ptr @1, align 8, !tbaa !5
%8 = and i64 %7, %6
store i64 %8, ptr @1, align 8, !tbaa !5
- call void @llvm.lifetime.end.p0(i64 4, ptr undef) #2
+ call void @dummy()
unreachable
; <label>:9: ; preds = %0
diff --git a/llvm/test/Analysis/MemorySSA/pr43044.ll b/llvm/test/Analysis/MemorySSA/pr43044.ll
index f4e0ce9..bd767d3 100644
--- a/llvm/test/Analysis/MemorySSA/pr43044.ll
+++ b/llvm/test/Analysis/MemorySSA/pr43044.ll
@@ -47,6 +47,8 @@ cleanup1400.loopexit1: ; preds = %for.cond1050
br label %cleanup1400
cleanup1400: ; preds = %cleanup1400.loopexit1, %cleanup1400.loopexit.split
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull undef)
+ call void @dummy()
unreachable
}
+
+declare void @dummy()
diff --git a/llvm/test/Analysis/MemorySSA/pr43427.ll b/llvm/test/Analysis/MemorySSA/pr43427.ll
index a9b442c..254fb11 100644
--- a/llvm/test/Analysis/MemorySSA/pr43427.ll
+++ b/llvm/test/Analysis/MemorySSA/pr43427.ll
@@ -30,7 +30,7 @@
; CHECK-NEXT: ; [[NO6:.*]] = MemoryDef([[NO7]])
; CHECK-NEXT: store i16 undef, ptr %e, align 1
; CHECK-NEXT: 3 = MemoryDef([[NO6]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr null)
+; CHECK-NEXT: call void @g()
define void @f(i1 %arg) {
entry:
@@ -57,7 +57,7 @@ cleanup: ; preds = %lbl3
br i1 %switch, label %cleanup.cont, label %lbl1
cleanup.cont: ; preds = %cleanup
- call void @llvm.lifetime.end.p0(i64 1, ptr null)
+ call void @g()
ret void
if.else: ; preds = %lbl1
@@ -65,6 +65,3 @@ if.else: ; preds = %lbl1
}
declare void @g()
-
-; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/Analysis/MemorySSA/pr43438.ll b/llvm/test/Analysis/MemorySSA/pr43438.ll
index d137c52..0e09137 100644
--- a/llvm/test/Analysis/MemorySSA/pr43438.ll
+++ b/llvm/test/Analysis/MemorySSA/pr43438.ll
@@ -87,7 +87,7 @@ if.else: ; preds = %lbl1
]
if.end12: ; preds = %cleanup.cont11s, %cleanup.cont
- call void @llvm.lifetime.end.p0(i64 1, ptr undef)
+ call i16 @g(i16 1)
ret void
unreachable: ; preds = %if.else, %for.end5
@@ -95,6 +95,3 @@ unreachable: ; preds = %if.else, %for.end5
}
declare i16 @g(i16)
-
-; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/Analysis/MemorySSA/renamephis.ll b/llvm/test/Analysis/MemorySSA/renamephis.ll
index 0e8cf8b..e297b99 100644
--- a/llvm/test/Analysis/MemorySSA/renamephis.ll
+++ b/llvm/test/Analysis/MemorySSA/renamephis.ll
@@ -41,7 +41,7 @@ block.exit: ; preds = %cond.exit
unreachable
sw.bb94: ; preds = %cond.exit
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull undef)
+ call void @g()
br label %cleanup
cleanup: ; preds = %sw.bb94, %cond.exit, %cond.exit
diff --git a/llvm/test/Analysis/ScalarEvolution/add-expr-pointer-operand-sorting.ll b/llvm/test/Analysis/ScalarEvolution/add-expr-pointer-operand-sorting.ll
index 1799d15..39b475d 100644
--- a/llvm/test/Analysis/ScalarEvolution/add-expr-pointer-operand-sorting.ll
+++ b/llvm/test/Analysis/ScalarEvolution/add-expr-pointer-operand-sorting.ll
@@ -21,28 +21,26 @@ define i32 @d(i32 %base) {
; CHECK-NEXT: Classifying expressions for: @d
; CHECK-NEXT: %e = alloca [1 x [1 x i8]], align 1
; CHECK-NEXT: --> %e U: full-set S: full-set
-; CHECK-NEXT: %0 = bitcast ptr %e to ptr
-; CHECK-NEXT: --> %e U: full-set S: full-set
; CHECK-NEXT: %f.0 = phi i32 [ %base, %entry ], [ %inc, %for.cond ]
; CHECK-NEXT: --> {%base,+,1}<nsw><%for.cond> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Computable }
; CHECK-NEXT: %idxprom = sext i32 %f.0 to i64
; CHECK-NEXT: --> {(sext i32 %base to i64),+,1}<nsw><%for.cond> U: [-2147483648,-9223372036854775808) S: [-2147483648,-9223372036854775808) Exits: <<Unknown>> LoopDispositions: { %for.cond: Computable }
; CHECK-NEXT: %arrayidx = getelementptr inbounds [1 x [1 x i8]], ptr %e, i64 0, i64 %idxprom
; CHECK-NEXT: --> {((sext i32 %base to i64) + %e),+,1}<nw><%for.cond> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Computable }
-; CHECK-NEXT: %1 = load ptr, ptr @c, align 8
-; CHECK-NEXT: --> %1 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
-; CHECK-NEXT: %sub.ptr.lhs.cast = ptrtoint ptr %1 to i64
-; CHECK-NEXT: --> (ptrtoint ptr %1 to i64) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
+; CHECK-NEXT: %load1 = load ptr, ptr @c, align 8
+; CHECK-NEXT: --> %load1 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
+; CHECK-NEXT: %sub.ptr.lhs.cast = ptrtoint ptr %load1 to i64
+; CHECK-NEXT: --> (ptrtoint ptr %load1 to i64) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
; CHECK-NEXT: %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, ptrtoint (ptr @b to i64)
-; CHECK-NEXT: --> ((-1 * (ptrtoint ptr @b to i64)) + (ptrtoint ptr %1 to i64)) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
+; CHECK-NEXT: --> ((-1 * (ptrtoint ptr @b to i64)) + (ptrtoint ptr %load1 to i64)) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
; CHECK-NEXT: %sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 4
; CHECK-NEXT: --> %sub.ptr.div U: [-2305843009213693952,2305843009213693952) S: [-2305843009213693952,2305843009213693952) Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
; CHECK-NEXT: %arrayidx1 = getelementptr inbounds [1 x i8], ptr %arrayidx, i64 0, i64 %sub.ptr.div
; CHECK-NEXT: --> ({((sext i32 %base to i64) + %e),+,1}<nw><%for.cond> + %sub.ptr.div) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
-; CHECK-NEXT: %2 = load i8, ptr %arrayidx1, align 1
-; CHECK-NEXT: --> %2 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
-; CHECK-NEXT: %conv = sext i8 %2 to i32
-; CHECK-NEXT: --> (sext i8 %2 to i32) U: [-128,128) S: [-128,128) Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
+; CHECK-NEXT: %load2 = load i8, ptr %arrayidx1, align 1
+; CHECK-NEXT: --> %load2 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
+; CHECK-NEXT: %conv = sext i8 %load2 to i32
+; CHECK-NEXT: --> (sext i8 %load2 to i32) U: [-128,128) S: [-128,128) Exits: <<Unknown>> LoopDispositions: { %for.cond: Variant }
; CHECK-NEXT: %inc = add nsw i32 %f.0, 1
; CHECK-NEXT: --> {(1 + %base),+,1}<nw><%for.cond> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond: Computable }
; CHECK-NEXT: Determining loop execution counts for: @d
@@ -52,21 +50,20 @@ define i32 @d(i32 %base) {
;
entry:
%e = alloca [1 x [1 x i8]], align 1
- %0 = bitcast ptr %e to ptr
- call void @llvm.lifetime.start.p0(i64 1, ptr %0) #2
+ call void @llvm.lifetime.start.p0(i64 1, ptr %e) #2
br label %for.cond
for.cond: ; preds = %for.cond, %entry
%f.0 = phi i32 [ %base, %entry ], [ %inc, %for.cond ]
%idxprom = sext i32 %f.0 to i64
%arrayidx = getelementptr inbounds [1 x [1 x i8]], ptr %e, i64 0, i64 %idxprom
- %1 = load ptr, ptr @c, align 8
- %sub.ptr.lhs.cast = ptrtoint ptr %1 to i64
+ %load1 = load ptr, ptr @c, align 8
+ %sub.ptr.lhs.cast = ptrtoint ptr %load1 to i64
%sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, ptrtoint (ptr @b to i64)
%sub.ptr.div = sdiv exact i64 %sub.ptr.sub, 4
%arrayidx1 = getelementptr inbounds [1 x i8], ptr %arrayidx, i64 0, i64 %sub.ptr.div
- %2 = load i8, ptr %arrayidx1, align 1
- %conv = sext i8 %2 to i32
+ %load2 = load i8, ptr %arrayidx1, align 1
+ %conv = sext i8 %load2 to i32
store i32 %conv, ptr @a, align 4
%inc = add nsw i32 %f.0, 1
br label %for.cond
diff --git a/llvm/test/Analysis/ScalarEvolution/sdiv.ll b/llvm/test/Analysis/ScalarEvolution/sdiv.ll
index e01f84f..9eaaf8b 100644
--- a/llvm/test/Analysis/ScalarEvolution/sdiv.ll
+++ b/llvm/test/Analysis/ScalarEvolution/sdiv.ll
@@ -38,7 +38,7 @@ define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
entry:
%storage = alloca [2 x i32], align 4
%0 = bitcast ptr %storage to ptr
- call void @llvm.lifetime.start.p0(i64 8, ptr %0) #4
+ call void @llvm.lifetime.start.p0(i64 8, ptr %storage) #4
call void @llvm.memset.p0.i64(ptr align 4 %0, i8 0, i64 8, i1 false)
br label %for.cond
@@ -48,7 +48,7 @@ for.cond:
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
- call void @llvm.lifetime.end.p0(i64 8, ptr %0) #4
+ call void @llvm.lifetime.end.p0(i64 8, ptr %storage) #4
ret void
for.body:
diff --git a/llvm/test/Analysis/ScalarEvolution/srem.ll b/llvm/test/Analysis/ScalarEvolution/srem.ll
index ff898c9..377e58a 100644
--- a/llvm/test/Analysis/ScalarEvolution/srem.ll
+++ b/llvm/test/Analysis/ScalarEvolution/srem.ll
@@ -38,7 +38,7 @@ define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
entry:
%storage = alloca [2 x i32], align 4
%0 = bitcast ptr %storage to ptr
- call void @llvm.lifetime.start.p0(i64 8, ptr %0) #4
+ call void @llvm.lifetime.start.p0(i64 8, ptr %storage) #4
call void @llvm.memset.p0.i64(ptr align 4 %0, i8 0, i64 8, i1 false)
br label %for.cond
@@ -48,7 +48,7 @@ for.cond:
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
- call void @llvm.lifetime.end.p0(i64 8, ptr %0) #4
+ call void @llvm.lifetime.end.p0(i64 8, ptr %storage) #4
ret void
for.body:
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll b/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll
index 37fa7d3e..7fa1cf4 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/lifetime.ll
@@ -786,83 +786,6 @@ end:
ret void
}
-define void @non_alloca(ptr %p) {
-; CHECK-LABEL: define void @non_alloca
-entry:
-; CHECK: entry:
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
- %x = alloca i8, align 4
- %y = alloca i8, align 4
-
- call void @llvm.lifetime.start.p0(i64 4, ptr %p)
-; CHECK: call void @llvm.lifetime.start.p0(i64 4, ptr %p)
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
-
- call void @llvm.lifetime.start.p0(i64 4, ptr %x)
-; CHECK: call void @llvm.lifetime.start.p0(i64 4, ptr %x)
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
-
- call void @llvm.lifetime.end.p0(i64 4, ptr %p)
-; CHECK: call void @llvm.lifetime.end.p0(i64 4, ptr %p)
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
-
- ret void
-}
-
-define void @select_alloca(i1 %v) {
-; CHECK-LABEL: define void @select_alloca
-entry:
-; CHECK: entry:
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
- %x = alloca i8, align 4
- %y = alloca i8, align 4
- %cxcy = select i1 %v, ptr %x, ptr %y
-
- call void @llvm.lifetime.start.p0(i64 1, ptr %cxcy)
-; CHECK: call void @llvm.lifetime.start.p0(i64 1, ptr %cxcy)
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
-
- call void @llvm.lifetime.start.p0(i64 1, ptr %x)
-; CHECK: call void @llvm.lifetime.start.p0(i64 1, ptr %x)
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
-
- call void @llvm.lifetime.end.p0(i64 1, ptr %x)
-; CHECK: call void @llvm.lifetime.end.p0(i64 1, ptr %x)
-; MAY-NEXT: Alive: <x y>
-; MUST-NEXT: Alive: <>
-
- ret void
-}
-
-define void @alloca_offset() {
-; CHECK-LABEL: define void @alloca_offset
-entry:
-; CHECK: entry:
-; MAY-NEXT: Alive: <x>
-; MUST-NEXT: Alive: <>
- %x = alloca [5 x i32], align 4
- %x2 = getelementptr [5 x i32], ptr %x, i64 0, i64 1
-
- call void @llvm.lifetime.start.p0(i64 20, ptr %x2)
-; CHECK: call void @llvm.lifetime.start.p0(i64 20, ptr %x2)
-; MAY-NEXT: Alive: <x>
-; MUST-NEXT: Alive: <>
-
- call void @llvm.lifetime.end.p0(i64 20, ptr %x2)
-; CHECK: call void @llvm.lifetime.end.p0(i64 20, ptr %x2)
-; MAY-NEXT: Alive: <x>
-; MUST-NEXT: Alive: <>
-
- ret void
-}
-
define void @alloca_size() {
; CHECK-LABEL: define void @alloca_size
entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 705c128..10c656a 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -302,6 +302,14 @@ define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8
ret void
}
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+define amdgpu_ps void @wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
%tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
@@ -836,6 +844,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index a17f11a..362586a 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -17,6 +17,8 @@ declare float @llvm.nvvm.fabs.f(float)
declare float @llvm.nvvm.fabs.ftz.f(float)
declare double @llvm.nvvm.fabs.d(double)
+declare float @llvm.nvvm.tanh.approx.f32(float)
+
declare i16 @llvm.nvvm.max.s(i16, i16)
declare i32 @llvm.nvvm.max.i(i32, i32)
declare i64 @llvm.nvvm.max.ll(i64, i64)
@@ -138,6 +140,13 @@ define void @fabs(float %a, double %b) {
ret void
}
+; CHECK-LABEL: @tanh
+define void @tanh(float %a) {
+; CHECK: call afn float @llvm.tanh.f32(float %a)
+ %r1 = call float @llvm.nvvm.tanh.approx.f32(float %a)
+ ret void
+}
+
; CHECK-LABEL: @min_max
define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) {
; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2
diff --git a/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll b/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll
new file mode 100644
index 0000000..00ab934
--- /dev/null
+++ b/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S < %s | FileCheck %s
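+; Lifetime markers whose pointer operand is a bitcast, addrspacecast, or GEP
+; of an alloca should be upgraded to refer to the alloca directly; markers on
+; pointers that cannot be traced back to an alloca should be dropped.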
+
+define void @strip_bitcast() {
+; CHECK-LABEL: define void @strip_bitcast() {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[B:%.*]] = bitcast ptr [[A]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8
+ %b = bitcast ptr %a to ptr
+ call void @llvm.lifetime.start.p0(i64 1, ptr %b)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %b)
+ ret void
+}
+
+define void @strip_addrspacecast() {
+; CHECK-LABEL: define void @strip_addrspacecast() {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[B:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8
+ %b = addrspacecast ptr %a to ptr addrspace(1)
+ call void @llvm.lifetime.start.p1(i64 1, ptr addrspace(1) %b)
+ call void @llvm.lifetime.end.p1(i64 1, ptr addrspace(1) %b)
+ ret void
+}
+
+define void @strip_gep() {
+; CHECK-LABEL: define void @strip_gep() {
+; CHECK-NEXT: [[A:%.*]] = alloca [2 x i8], align 1
+; CHECK-NEXT: [[B:%.*]] = getelementptr [2 x i8], ptr [[A]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca [2 x i8]
+ %b = getelementptr [2 x i8], ptr %a, i64 0, i64 0
+ call void @llvm.lifetime.start.p0(i64 1, ptr %b)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %b)
+ ret void
+}
+
+define void @remove_unanalyzable(ptr %p) {
+; CHECK-LABEL: define void @remove_unanalyzable(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.lifetime.start.p0(i64 1, ptr %p)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %p)
+ ret void
+}
diff --git a/llvm/test/Assembler/difile-empty-source.ll b/llvm/test/Assembler/difile-empty-source.ll
new file mode 100644
index 0000000..11587d8
--- /dev/null
+++ b/llvm/test/Assembler/difile-empty-source.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder
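+; An explicitly empty "source" field on a DIFile must survive the round-trip.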
+
+; CHECK: !DIFile({{.*}}, source: "")
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "-", directory: "/", checksumkind: CSK_MD5, checksum: "d41d8cd98f00b204e9800998ecf8427e", source: "")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index 9cf3fdb..0b5ce08 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -564,6 +564,10 @@ declare riscv_vls_cc(32768) void @riscv_vls_cc_32768()
; CHECK: declare riscv_vls_cc(32768) void @riscv_vls_cc_32768()
declare riscv_vls_cc(65536) void @riscv_vls_cc_65536()
; CHECK: declare riscv_vls_cc(65536) void @riscv_vls_cc_65536()
+declare cc124 void @f.cc124(i1)
+; CHECK: declare amdgpu_gfx_whole_wave void @f.cc124(i1)
+declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1)
+; CHECK: declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1)
declare cc1023 void @f.cc1023()
; CHECK: declare cc1023 void @f.cc1023()
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-gep-flags.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-gep-flags.ll
index 34ac4f6..8a6f266 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-gep-flags.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-gep-flags.ll
@@ -17,8 +17,8 @@ define i32 @gep_nusw_nuw(ptr %ptr, i32 %idx) {
; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL1]](s64)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; CHECK-NEXT: %11:_(p0) = nuw nusw G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD %11(p0) :: (load (s32) from %ir.gep2)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = nuw nusw G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.gep2)
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD1]]
; CHECK-NEXT: $w0 = COPY [[ADD]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
@@ -77,8 +77,8 @@ define i32 @gep_nusw(ptr %ptr, i32 %idx) {
; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL1]](s64)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; CHECK-NEXT: %11:_(p0) = nusw G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD %11(p0) :: (load (s32) from %ir.gep2)
+ ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = nusw G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.gep2)
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD1]]
; CHECK-NEXT: $w0 = COPY [[ADD]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll
index 55cf48e..d1a6584a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll
@@ -9,7 +9,7 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
declare i32 @logg(...)
-define i32 @scanfile(i32 %call148) {
+define i32 @scanfile(i32 %call148, ptr %p) {
; CHECK-LABEL: scanfile:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
@@ -26,7 +26,7 @@ define i32 @scanfile(i32 %call148) {
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: LBB0_3: ; %entry
-; CHECK-NEXT: b.eq LBB0_2
+; CHECK-NEXT: b.eq LBB0_10
; CHECK-NEXT: ; %bb.4: ; %entry
; CHECK-NEXT: cmp w8, #2
; CHECK-NEXT: b.eq LBB0_6
@@ -46,6 +46,10 @@ define i32 @scanfile(i32 %call148) {
; CHECK-NEXT: LBB0_9: ; %sw.bb150
; CHECK-NEXT: bl _logg
; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: LBB0_10: ; %sw.bb178
+; CHECK-NEXT: str wzr, [x1]
+; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-NEXT: ret
entry:
switch i32 %call148, label %common.ret [
i32 -1, label %sw.bb
@@ -80,7 +84,7 @@ sw.bb152: ; preds = %entry
br label %common.ret
sw.bb178: ; preds = %entry
- call void @llvm.lifetime.start.p0(i64 0, ptr null)
+ store i32 0, ptr %p
br label %common.ret
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
new file mode 100644
index 0000000..8552931
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
@@ -0,0 +1,109 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple aarch64 -passes="print<gisel-value-tracking>" %s -o - 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:10000000 SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:11110000 SignBits:4
+ %0:_(s8) = G_CONSTANT i8 128
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: CstBig
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstBig
+ ; CHECK-NEXT: %0:_ KnownBits:11111000 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:00000110 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 248
+ %1:_(s8) = G_CONSTANT i8 6
+ %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: ScalarCst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarCst
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:4
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: VectorVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(<4 x s16>) = COPY $d1
+ %2:_(<4 x s16>) = G_ASHR %0, %1
+...
+---
+name: VectorCst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:4
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_ASHR %0, %2
+...
+---
+name: VectorCst36
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst36
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000110 SignBits:13
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:4
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(s16) = G_CONSTANT i16 6
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+ %4:_(<4 x s16>) = G_ASHR %0, %3
+...
+---
+name: VectorCst3unknown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst3unknown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %2:_(s16) = COPY $h0
+ %1:_(s16) = G_CONSTANT i16 3
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+ %4:_(<4 x s16>) = G_ASHR %0, %3
+...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index be79135..747db39 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -14,10 +14,10 @@ define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) {
; CHECK-GI-LABEL: dupsext_v8i8_v8i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsl w8, w0, #8
-; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
; CHECK-GI-NEXT: dup v1.8h, w8
-; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: smull v0.8h, v1.8b, v0.8b
; CHECK-GI-NEXT: ret
entry:
%in = sext i8 %src to i16
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index ff7872c..83530049a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -87,46 +87,17 @@ entry:
}
define void @memset_10_zeroval_volatile(ptr %dst) {
-; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_zeroval_volatile:
-; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w1, wzr
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O0-NEXT: ret
-;
-; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_zeroval_volatile:
-; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w1, wzr
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O3-NEXT: ret
-;
-; GISel-MOPS-O0-LABEL: memset_10_zeroval_volatile:
-; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: mov x9, xzr
-; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: ret
+; GISel-WITHOUT-MOPS-LABEL: memset_10_zeroval_volatile:
+; GISel-WITHOUT-MOPS: // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-NEXT: str xzr, [x0]
+; GISel-WITHOUT-MOPS-NEXT: strh wzr, [x0, #8]
+; GISel-WITHOUT-MOPS-NEXT: ret
;
-; GISel-MOPS-O3-LABEL: memset_10_zeroval_volatile:
-; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: ret
+; GISel-MOPS-LABEL: memset_10_zeroval_volatile:
+; GISel-MOPS: // %bb.0: // %entry
+; GISel-MOPS-NEXT: str xzr, [x0]
+; GISel-MOPS-NEXT: strh wzr, [x0, #8]
+; GISel-MOPS-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_zeroval_volatile:
; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
@@ -490,43 +461,46 @@ entry:
define void @memset_10_volatile(ptr %dst, i32 %value) {
; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_volatile:
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
+; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
+; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
+; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
+; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O0-NEXT: ret
;
; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_volatile:
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
+; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
+; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
+; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
+; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O3-NEXT: ret
;
; GISel-MOPS-O0-LABEL: memset_10_volatile:
; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: // implicit-def: $x9
-; GISel-MOPS-O0-NEXT: mov w9, w1
-; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9
+; GISel-MOPS-O0-NEXT: // implicit-def: $x8
+; GISel-MOPS-O0-NEXT: mov w8, w1
+; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
+; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
+; GISel-MOPS-O0-NEXT: mul x8, x8, x9
+; GISel-MOPS-O0-NEXT: str x8, [x0]
+; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O0-NEXT: ret
;
; GISel-MOPS-O3-LABEL: memset_10_volatile:
; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, x1
-; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, x1
-; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, x1
+; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
+; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
+; GISel-MOPS-O3-NEXT: mul x8, x9, x8
+; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O3-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_volatile:
@@ -905,43 +879,21 @@ entry:
}
define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) {
-; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_10_volatile:
-; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memcpy
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O0-NEXT: ret
-;
-; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_10_volatile:
-; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memcpy
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O3-NEXT: ret
-;
-; GISel-MOPS-O0-LABEL: memcpy_10_volatile:
-; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: cpyfp [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpyfm [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpyfe [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: ret
+; GISel-WITHOUT-MOPS-LABEL: memcpy_10_volatile:
+; GISel-WITHOUT-MOPS: // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-NEXT: ldr x8, [x1]
+; GISel-WITHOUT-MOPS-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-NEXT: ldrh w8, [x1, #8]
+; GISel-WITHOUT-MOPS-NEXT: strh w8, [x0, #8]
+; GISel-WITHOUT-MOPS-NEXT: ret
;
-; GISel-MOPS-O3-LABEL: memcpy_10_volatile:
-; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: cpyfp [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpyfm [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpyfe [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: ret
+; GISel-MOPS-LABEL: memcpy_10_volatile:
+; GISel-MOPS: // %bb.0: // %entry
+; GISel-MOPS-NEXT: ldr x8, [x1]
+; GISel-MOPS-NEXT: str x8, [x0]
+; GISel-MOPS-NEXT: ldrh w8, [x1, #8]
+; GISel-MOPS-NEXT: strh w8, [x0, #8]
+; GISel-MOPS-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_10_volatile:
; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
@@ -1736,40 +1688,34 @@ entry:
define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) {
; GISel-WITHOUT-MOPS-O0-LABEL: memmove_10_volatile:
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memmove
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O0-NEXT: ldr x9, [x1]
+; GISel-WITHOUT-MOPS-O0-NEXT: ldrh w8, [x1, #8]
+; GISel-WITHOUT-MOPS-O0-NEXT: str x9, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O0-NEXT: ret
;
; GISel-WITHOUT-MOPS-O3-LABEL: memmove_10_volatile:
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memmove
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O3-NEXT: ldr x8, [x1]
+; GISel-WITHOUT-MOPS-O3-NEXT: ldrh w9, [x1, #8]
+; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: strh w9, [x0, #8]
; GISel-WITHOUT-MOPS-O3-NEXT: ret
;
; GISel-MOPS-O0-LABEL: memmove_10_volatile:
; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: cpyp [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpym [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpye [x0]!, [x1]!, x8!
+; GISel-MOPS-O0-NEXT: ldr x9, [x1]
+; GISel-MOPS-O0-NEXT: ldrh w8, [x1, #8]
+; GISel-MOPS-O0-NEXT: str x9, [x0]
+; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O0-NEXT: ret
;
; GISel-MOPS-O3-LABEL: memmove_10_volatile:
; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: cpyp [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpym [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpye [x0]!, [x1]!, x8!
+; GISel-MOPS-O3-NEXT: ldr x8, [x1]
+; GISel-MOPS-O3-NEXT: ldrh w9, [x1, #8]
+; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: strh w9, [x0, #8]
; GISel-MOPS-O3-NEXT: ret
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memmove_10_volatile:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 2f23a32..6e5c666 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -2264,33 +2264,12 @@ define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) {
}
define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-NEON-LABEL: asr:
-; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-NEON-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT: ret
-;
-; CHECK-SVE-LABEL: asr:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-SVE-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-GI-LABEL: asr:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32
-; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #32
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: asr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: shrn v1.2s, v1.2d, #32
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
%x = ashr <2 x i64> %a, <i64 32, i64 32>
%y = ashr <2 x i64> %b, <i64 32, i64 32>
%z = mul nsw <2 x i64> %x, %y
@@ -2298,34 +2277,12 @@ define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
}
define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-NEON-LABEL: asr_const:
-; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: movi v1.2s, #31
-; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT: ret
-;
-; CHECK-SVE-LABEL: asr_const:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: movi v1.2s, #31
-; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-GI-LABEL: asr_const:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI81_0
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI81_0]
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: asr_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.2s, #31
+; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
%x = ashr <2 x i64> %a, <i64 32, i64 32>
%z = mul nsw <2 x i64> %x, <i64 31, i64 31>
ret <2 x i64> %z
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index e31c9a0..113eb14 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -263,3 +263,110 @@ entry:
%conv = zext i1 %cmp to i8
ret i8 %conv
}
+
+; Test ANDS.
+define i32 @test1_ands(i32 %a) {
+; CHECK-LABEL: test1_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0x3ffc00
+; CHECK-NEXT: ands w8, w8, #0xffe007ff
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i32 %a, 2098176
+ %c = icmp eq i32 %ands, 0
+ %r = select i1 %c, i32 %a, i32 %ands
+ ret i32 %r
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_ands(i32 %a) {
+; CHECK-LABEL: test2_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: ands w8, w0, w8
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i32 %a, 135
+ %c = icmp eq i32 %ands, 0
+ %r = select i1 %c, i32 %a, i32 %ands
+ ret i32 %r
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i32 @test3_ands(i32 %a) {
+; CHECK-LABEL: test3_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: ands w8, w0, w8
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i32 %a, 2163712
+ %c = icmp eq i32 %ands, 0
+ %r = select i1 %c, i32 %a, i32 %ands
+ ret i32 %r
+}
+
+define i64 @test4_ands(i64 %a) {
+; CHECK-LABEL: test4_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffc00
+; CHECK-NEXT: ands x8, x8, #0xffffffffffe007ff
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 2098176
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
+
+define i64 @test5_ands(i64 %a) {
+; CHECK-LABEL: test5_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffffc000
+; CHECK-NEXT: ands x8, x8, #0xfffffffe00007fff
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 8589950976
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_ands(i64 %a) {
+; CHECK-LABEL: test6_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: ands x8, x0, x8
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 135
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i64 @test7_ands(i64 %a) {
+; CHECK-LABEL: test7_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: ands x8, x0, x8
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+ %ands = and i64 %a, 2163712
+ %c = icmp eq i64 %ands, 0
+ %r = select i1 %c, i64 %a, i64 %ands
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AArch64/abds-neg.ll b/llvm/test/CodeGen/AArch64/abds-neg.ll
index ac7cb1f..432ffc3 100644
--- a/llvm/test/CodeGen/AArch64/abds-neg.ll
+++ b/llvm/test/CodeGen/AArch64/abds-neg.ll
@@ -200,8 +200,7 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x8, x8, x10, lt
; CHECK-NEXT: csel x9, x9, x11, lt
; CHECK-NEXT: negs x0, x8
@@ -222,8 +221,7 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x8, x8, x10, lt
; CHECK-NEXT: csel x9, x9, x11, lt
; CHECK-NEXT: negs x0, x8
@@ -389,14 +387,12 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; CHECK-LABEL: abd_cmp_i128:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: sbc x8, x1, x3
-; CHECK-NEXT: subs x9, x2, x0
-; CHECK-NEXT: sbc x10, x3, x1
-; CHECK-NEXT: subs x11, x0, x2
-; CHECK-NEXT: sbcs xzr, x1, x3
-; CHECK-NEXT: csel x0, x11, x9, lt
-; CHECK-NEXT: csel x1, x8, x10, lt
+; CHECK-NEXT: subs x8, x2, x0
+; CHECK-NEXT: sbc x9, x3, x1
+; CHECK-NEXT: subs x10, x0, x2
+; CHECK-NEXT: sbcs x11, x1, x3
+; CHECK-NEXT: csel x0, x10, x8, lt
+; CHECK-NEXT: csel x1, x11, x9, lt
; CHECK-NEXT: ret
%cmp = icmp slt i128 %a, %b
%ab = sub i128 %a, %b
diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll
index 62db30f..ed1e607 100644
--- a/llvm/test/CodeGen/AArch64/abds.ll
+++ b/llvm/test/CodeGen/AArch64/abds.ll
@@ -183,8 +183,7 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x0, x8, x10, lt
; CHECK-NEXT: csel x1, x9, x11, lt
; CHECK-NEXT: ret
@@ -202,8 +201,7 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x0, x8, x10, lt
; CHECK-NEXT: csel x1, x9, x11, lt
; CHECK-NEXT: ret
@@ -279,8 +277,7 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x0, x8, x10, lt
; CHECK-NEXT: csel x1, x9, x11, lt
; CHECK-NEXT: ret
@@ -358,8 +355,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x0, x8, x10, lt
; CHECK-NEXT: csel x1, x9, x11, lt
; CHECK-NEXT: ret
@@ -607,8 +603,7 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; CHECK-NEXT: subs x8, x0, x2
; CHECK-NEXT: sbc x9, x1, x3
; CHECK-NEXT: subs x10, x2, x0
-; CHECK-NEXT: sbc x11, x3, x1
-; CHECK-NEXT: sbcs xzr, x3, x1
+; CHECK-NEXT: sbcs x11, x3, x1
; CHECK-NEXT: csel x0, x8, x10, lt
; CHECK-NEXT: csel x1, x9, x11, lt
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll
index 2118816..8fb106e 100644
--- a/llvm/test/CodeGen/AArch64/abdu-neg.ll
+++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll
@@ -391,14 +391,12 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; CHECK-LABEL: abd_cmp_i128:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: sbc x8, x1, x3
-; CHECK-NEXT: subs x9, x2, x0
-; CHECK-NEXT: sbc x10, x3, x1
-; CHECK-NEXT: subs x11, x0, x2
-; CHECK-NEXT: sbcs xzr, x1, x3
-; CHECK-NEXT: csel x0, x11, x9, lo
-; CHECK-NEXT: csel x1, x8, x10, lo
+; CHECK-NEXT: subs x8, x2, x0
+; CHECK-NEXT: sbc x9, x3, x1
+; CHECK-NEXT: subs x10, x0, x2
+; CHECK-NEXT: sbcs x11, x1, x3
+; CHECK-NEXT: csel x0, x10, x8, lo
+; CHECK-NEXT: csel x1, x11, x9, lo
; CHECK-NEXT: ret
%cmp = icmp ult i128 %a, %b
%ab = sub i128 %a, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
index 9dfc8df..9666c5c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
@@ -136,3 +136,18 @@ entry:
%0 = load i64, ptr %arrayidx, align 8
ret i64 %0
}
+
+define <2 x i64> @loadv2i64_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: loadv2i64_shr1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul x8, x1, x0
+; CHECK-NEXT: lsr x8, x8, #1
+; CHECK-NEXT: ldr q0, [x2, x8, lsl #4]
+; CHECK-NEXT: ret
+entry:
+ %mul = mul i64 %b, %a
+ %shr = lshr i64 %mul, 1
+ %arrayidx = getelementptr inbounds <2 x i64>, ptr %table, i64 %shr
+ %0 = load <2 x i64>, ptr %arrayidx, align 16
+ ret <2 x i64> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir b/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir
new file mode 100644
index 0000000..23ac67c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir
@@ -0,0 +1,98 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s
+
+
+---
+name: BSL_COPY
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+
+ ; CHECK-LABEL: name: BSL_COPY
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = ORRv16i8 killed renamable $q20, killed renamable $q20
+ ; CHECK-NEXT: renamable $q2 = BSLv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BSL
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BSL
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BSLv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BIF
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BIF
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BIFv16i8 renamable $q2, renamable $q6, killed renamable $q20, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q2, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BIT
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BIT
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BITv16i8 renamable $q2, renamable $q21, killed renamable $q20, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q21, renamable $q2, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index 2b7fa08..e1ba0e9 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1631,7 +1631,6 @@ define i8 @combine_i8_sdiv_const100(i8 %x) {
; CHECK-GI-NEXT: sxtb w8, w0
; CHECK-GI-NEXT: mov w9, #41 // =0x29
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
; CHECK-GI-NEXT: asr w8, w8, #4
; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
new file mode 100644
index 0000000..1a83930
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+; load zero-extended i32, bitcast to f64
+define double @_Z9load_u64_from_u32_testPj(ptr %n){
+; CHECK-LABEL: _Z9load_u64_from_u32_testPj:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load i32, ptr %n, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+; load zero-extended i16, bitcast to f64
+define double @_Z9load_u64_from_u16_testPj(ptr %n){
+; CHECK-LABEL: _Z9load_u64_from_u16_testPj:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load i16, ptr %n, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+; load zero-extended i8, bitcast to f64
+define double @_Z16load_u64_from_u8Ph(ptr %n){
+; CHECK-LABEL: _Z16load_u64_from_u8Ph:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load i8, ptr %n, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+; load zero-extended i16, bitcast to f32
+define float @_Z17load_u32_from_u16Pt(ptr %n){
+; CHECK-LABEL: _Z17load_u32_from_u16Pt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load i16, ptr %n, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+; load zero-extended i8, bitcast to f32
+define float @_Z16load_u32_from_u8Ph(ptr %n){
+; CHECK-LABEL: _Z16load_u32_from_u8Ph:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load i8, ptr %n, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+; load zero-extended i8, bitcast to f16
+define half @_Z16load_u16_from_u8Ph(ptr %n){
+; CHECK-LABEL: _Z16load_u16_from_u8Ph:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %0 = load i8, ptr %n, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index b124042..c57383a 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -52,7 +52,6 @@ define i8 @si8_100(i8 %a, i8 %b) {
; CHECK-GI-NEXT: sxtb w8, w0
; CHECK-GI-NEXT: mov w9, #41 // =0x29
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
; CHECK-GI-NEXT: asr w8, w8, #4
; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging.ll b/llvm/test/CodeGen/AArch64/stack-tagging.ll
index 8759fb1..5d73c7b 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging.ll
@@ -143,54 +143,4 @@ l:
; CHECK-NOT: @llvm.aarch64.irg.sp
; CHECK: ret void
-; If we can't trace one of the lifetime markers to a single alloca, fall back
-; to poisoning all allocas at the beginning of the function.
-; Each alloca must be poisoned only once.
-define void @UnrecognizedLifetime(i8 %v) sanitize_memtag {
-entry:
- %x = alloca i32, align 4
- %y = alloca i32, align 4
- %z = alloca i32, align 4
- %tobool = icmp eq i8 %v, 0
- %xy = select i1 %tobool, ptr %x, ptr %y
- %cxcy = select i1 %tobool, ptr %x, ptr %y
- br label %another_bb
-
-another_bb:
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z)
- store i32 7, ptr %z
- call void @noUse32(ptr %z)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z)
- store i32 7, ptr %z
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cxcy)
- store i32 8, ptr %xy
- call void @noUse32(ptr %x)
- call void @noUse32(ptr %y)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cxcy)
- ret void
-}
-
-; CHECK-LABEL: define void @UnrecognizedLifetime(
-; CHECK: call ptr @llvm.aarch64.irg.sp(i64 0)
-; CHECK: alloca { i32, [12 x i8] }, align 16
-; CHECK: call ptr @llvm.aarch64.tagp
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: alloca { i32, [12 x i8] }, align 16
-; CHECK: call ptr @llvm.aarch64.tagp
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: alloca { i32, [12 x i8] }, align 16
-; CHECK: call ptr @llvm.aarch64.tagp
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: store i32
-; CHECK: call void @noUse32(ptr
-; CHECK: store i32
-; CHECK: store i32
-; CHECK: call void @noUse32(ptr
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: ret void
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
index 05abfa3..29e94dd6 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
@@ -268,6 +268,20 @@ define <vscale x 2 x bfloat> @ld1_nxv2bf16(ptr %addr, i64 %off) {
ret <vscale x 2 x bfloat> %val
}
+; Ensure we don't lose the free shift when using indexed addressing.
+define <vscale x 2 x bfloat> @ld1_nxv2bf16_double_shift(ptr %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2bf16_double_shift:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x1, #6
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ret
+ %off2 = lshr i64 %off, 6
+ %ptr = getelementptr inbounds bfloat, ptr %addr, i64 %off2
+ %val = load volatile <vscale x 2 x bfloat>, ptr %ptr
+ ret <vscale x 2 x bfloat> %val
+}
+
; LD1W
define <vscale x 4 x i32> @ld1_nxv4i32(ptr %addr, i64 %off) {
@@ -327,6 +341,20 @@ define <vscale x 2 x float> @ld1_nxv2f32(ptr %addr, i64 %off) {
ret <vscale x 2 x float> %val
}
+; Ensure we don't lose the free shift when using indexed addressing.
+define <vscale x 2 x float> @ld1_nxv2f32_double_shift(ptr %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2f32_double_shift:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x1, #6
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
+; CHECK-NEXT: ret
+ %off2 = lshr i64 %off, 6
+ %ptr = getelementptr inbounds float, ptr %addr, i64 %off2
+ %val = load volatile <vscale x 2 x float>, ptr %ptr
+ ret <vscale x 2 x float> %val
+}
+
; LD1D
define <vscale x 2 x i64> @ld1_nxv2i64(ptr %addr, i64 %off) {
@@ -350,3 +378,17 @@ define <vscale x 2 x double> @ld1_nxv2f64(ptr %addr, i64 %off) {
%val = load volatile <vscale x 2 x double>, ptr %ptr
ret <vscale x 2 x double> %val
}
+
+; Ensure we don't lose the free shift when using indexed addressing.
+define <vscale x 2 x double> @ld1_nxv2f64_double_shift(ptr %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2f64_double_shift:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x1, #6
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; CHECK-NEXT: ret
+ %off2 = lshr i64 %off, 6
+ %ptr = getelementptr inbounds double, ptr %addr, i64 %off2
+ %val = load volatile <vscale x 2 x double>, ptr %ptr
+ ret <vscale x 2 x double> %val
+}
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 2212e0a..0dd6685 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
@@ -18,37 +19,54 @@ define i32 @fold_urem_positive_odd(i32 %x) {
ret i32 %1
}
-
define i32 @fold_urem_positive_even(i32 %x) {
-; CHECK-LABEL: fold_urem_positive_even:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16323 // =0x3fc3
-; CHECK-NEXT: mov w9, #1060 // =0x424
-; CHECK-NEXT: movk w8, #63310, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #42
-; CHECK-NEXT: msub w0, w8, w9, w0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fold_urem_positive_even:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #16323 // =0x3fc3
+; CHECK-SD-NEXT: mov w9, #1060 // =0x424
+; CHECK-SD-NEXT: movk w8, #63310, lsl #16
+; CHECK-SD-NEXT: umull x8, w0, w8
+; CHECK-SD-NEXT: lsr x8, x8, #42
+; CHECK-SD-NEXT: msub w0, w8, w9, w0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fold_urem_positive_even:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #16323 // =0x3fc3
+; CHECK-GI-NEXT: mov w9, #1060 // =0x424
+; CHECK-GI-NEXT: movk w8, #63310, lsl #16
+; CHECK-GI-NEXT: umull x8, w0, w8
+; CHECK-GI-NEXT: lsr x8, x8, #32
+; CHECK-GI-NEXT: lsr w8, w8, #10
+; CHECK-GI-NEXT: msub w0, w8, w9, w0
+; CHECK-GI-NEXT: ret
%1 = urem i32 %x, 1060
ret i32 %1
}
-
; Don't fold if we can combine urem with udiv.
define i32 @combine_urem_udiv(i32 %x) {
-; CHECK-LABEL: combine_urem_udiv:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969 // =0x2309
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
-; CHECK-NEXT: mov w9, #95 // =0x5f
-; CHECK-NEXT: lsr w8, w8, #6
-; CHECK-NEXT: msub w9, w8, w9, w0
-; CHECK-NEXT: add w0, w9, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: combine_urem_udiv:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #8969 // =0x2309
+; CHECK-SD-NEXT: movk w8, #22765, lsl #16
+; CHECK-SD-NEXT: umull x8, w0, w8
+; CHECK-SD-NEXT: lsr x8, x8, #32
+; CHECK-SD-NEXT: sub w9, w0, w8
+; CHECK-SD-NEXT: add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT: mov w9, #95 // =0x5f
+; CHECK-SD-NEXT: lsr w8, w8, #6
+; CHECK-SD-NEXT: msub w9, w8, w9, w0
+; CHECK-SD-NEXT: add w0, w9, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_urem_udiv:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #95 // =0x5f
+; CHECK-GI-NEXT: udiv w9, w0, w8
+; CHECK-GI-NEXT: msub w8, w9, w8, w0
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
%1 = urem i32 %x, 95
%2 = udiv i32 %x, 95
%3 = add i32 %1, %2
diff --git a/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll b/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll
deleted file mode 100644
index 18b8aab..0000000
--- a/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc %s --mtriple=aarch64-pc-windows-msvc -o - | FileCheck %s
-
-; Tests the fixed object layouts when two catchpads re-use the same stack
-; allocation for this catch objects.
-
-; Generated from this C++ code, with modifications to the IR (see comments in
-; IR):
-; https://godbolt.org/z/9qv5Yn68j
-; > clang --target=aarch64-pc-windows-msvc test.cpp
-; ```
-; extern "C" void boom();
-; extern "C" int calls_boom();
-; {
-; try { boom(); }
-; catch (int& i) { return i; }
-; catch (long& l) { return l; }
-; return 0;
-; }
-; ```
-
-; Only need 48 bytes on the stack, not 64.
-; CHECK-LABEL: calls_boom:
-; CHECK: sub sp, sp, #48
-; CHECK: .seh_stackalloc 48
-
-; Both the catch blocks load from the same address.
-; CHECK-LABEL: "?catch$3@?0?calls_boom@4HA":
-; CHECK: ldr x8, [x29, #24]
-; CHECK-LABEL: "?catch$4@?0?calls_boom@4HA":
-; CHECK: ldr x8, [x29, #24]
-
-; There's enough space for the UnwindHelp to be at -16 instead of -32
-; CHECK-LABEL: $cppxdata$calls_boom:
-; CHECK: .word -16 // UnwindHelp
-
-; Both catches have the same object offset.
-; CHECK-LABEL: $handlerMap$0$calls_boom:
-; CHECK: .word -8 // CatchObjOffset
-; CHECK-NEXT: .word "?catch$3@?0?calls_boom@4HA"@IMGREL // Handler
-; CHECK: .word -8 // CatchObjOffset
-; CHECK-NEXT: .word "?catch$4@?0?calls_boom@4HA"@IMGREL // Handler
-
-%rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] }
-
-$"??_R0H@8" = comdat any
-
-$"??_R0J@8" = comdat any
-
-@"??_7type_info@@6B@" = external constant ptr
-@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".H\00" }, comdat
-@"??_R0J@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".J\00" }, comdat
-
-define dso_local i32 @calls_boom() personality ptr @__CxxFrameHandler3 {
-entry:
- %retval = alloca i32, align 4
-; MODIFICATION: Remove unusued alloca
-; %l = alloca ptr, align 8
- %i = alloca ptr, align 8
- invoke void @boom()
- to label %invoke.cont unwind label %catch.dispatch
-
-catch.dispatch:
- %0 = catchswitch within none [label %catch1, label %catch] unwind to caller
-
-catch1:
- %1 = catchpad within %0 [ptr @"??_R0H@8", i32 8, ptr %i]
- %2 = load ptr, ptr %i, align 8
- %3 = load i32, ptr %2, align 4
- store i32 %3, ptr %retval, align 4
- catchret from %1 to label %catchret.dest2
-
-catch:
-; MODIFICATION: Use %i instead of %l
- %4 = catchpad within %0 [ptr @"??_R0J@8", i32 8, ptr %i]
- %5 = load ptr, ptr %i, align 8
- %6 = load i32, ptr %5, align 4
- store i32 %6, ptr %retval, align 4
- catchret from %4 to label %catchret.dest
-
-invoke.cont:
- br label %try.cont
-
-catchret.dest:
- br label %return
-
-catchret.dest2:
- br label %return
-
-try.cont:
- store i32 0, ptr %retval, align 4
- br label %return
-
-return:
- %7 = load i32, ptr %retval, align 4
- ret i32 %7
-}
-
-declare dso_local void @boom() #1
-
-declare dso_local i32 @__CxxFrameHandler3(...)
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 840165d..4b53f66 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s
; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
; the global address space(1) uses 64-bit pointers. These tests check to make sure
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
index a727ed3..b68df4f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_ADD_LSHL_U32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index 38374d1..bbee880 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
define i32 @v_uaddo_i32(i32 %a, i32 %b) {
; GFX7-LABEL: v_uaddo_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index 425dd8a..7c9e203 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_add_u64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
index 6e4fb26..cdcc3a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
define hidden <2 x i64> @icmp_v2i32_sext_to_v2i64(<2 x i32> %arg) {
; CHECK-LABEL: icmp_v2i32_sext_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index a91e41e..b84b31c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
declare hidden ptr addrspace(1) @ext(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll
index 4618fc9..70cd963 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}atomic_load_monotonic_i8:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 28ed88f..65bc2d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll
index 04929852..dea42d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}atomic_store_monotonic_i8:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index aeb3019..a86939f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; FIXME: Merge with other test. DS offset folding doesn't work due to
; register bank copies, and no return optimization is missing.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 788a4e6..7958e40 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
; FIXME: Merge with other test. DS offset folding doesn't work due to
; register bank copies, and the no-return optimization is missing.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
index 37fc0e0..62a5313 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define void @main(<19 x i32> %arg) {
; GCN-LABEL: main:
; GCN: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index aba84cd..18895f7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
; End-to-end tests for scalar vs. vector boolean legalization strategies.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll
index 714328a..b1314dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; GFX908_GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll
index fb95d99..8567df0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offset_rtn
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
index 23931ac..59d60c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll
index 3ef735d..fbbb0de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll
index 756f287..76e2fca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll
index 20735bb..797e6ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck --check-prefix=GFX9 %s
define ptr @buffer_load_p0(ptr addrspace(8) inreg %buf) {
; GFX9-LABEL: name: buffer_load_p0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll
index e5aa822..96df689 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-schedule.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}test1:
; GCN: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7adaddf..6d2f253 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
-; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
+; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s
; Test end-to-end codegen for outgoing arguments passed on the
; stack. This test is likely redundant when all DAG and GlobalISel
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index ef88a2b..4fdc035 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefix=GFX12 %s
define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 {
; GFX10-LABEL: test_fmed3_f32_known_nnan_ieee_true:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index ab0de89..26b9d99 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefix=GFX12 %s
define float @test_min_max_ValK0_K1_f32(float %a) #0 {
; GFX10-LABEL: test_min_max_ValK0_K1_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
index ecf3b22..e71ab9f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 %s -o - | FileCheck -check-prefix=GCN %s
define amdgpu_cs float @div_sqrt(float inreg %arg1) {
; GCN-LABEL: div_sqrt:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
index 40fc2fb..2d3088f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck %s
define amdgpu_cs i32 @test_shl_1(i32 inreg %arg1) {
; CHECK-LABEL: test_shl_1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
index a36905c..5532443 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck %s
define amdgpu_cs i32 @test_shl_and_1(i32 inreg %arg1) {
; CHECK-LABEL: test_shl_and_1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
index 621394fd..adae3a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -1,7 +1,7 @@
-; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX678,GFX6789 %s
-; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX6789 %s
-; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mcpu=gfx1100 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefixes=GCN,GFX678,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefixes=GCN,GFX9,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mcpu=gfx1100 -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GFX10 %s
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
@@ -31,7 +31,7 @@ entry:
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32{{(_e64)?}} [[A:v[0-9]+]], {{v[0-9]+}}, [[B:v[0-9]+]]
; GFX10: v_mov_b32_e32 [[B]], 0x7fff
-; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
entry:
%min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll
index b60f4c1..aceff55 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
; Make sure we don't violate the constant bus restriction
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index e776413..94b956e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 11acd45..ff26ea2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
; Divergent phis that don't require lowering using lane mask merging
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index be90b02..a8a75cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; This file contains various tests that have divergent i1s used outside of
; the loop. These are lane masks in sgpr and need to have the correct value in
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index e31077d..fd08ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
; Simplest case, if - then, that requires lane mask merging,
; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index 0da2526..d13d6a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
; GFX10-LABEL: temporal_divergent_i1_phi:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
index 136f095..d4e5487 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
define void @temporal_divergent_i32(float %val, ptr %addr) {
; GFX10-LABEL: temporal_divergent_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 94dfd4e..6148bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; Make sure the branch targets are correct after lowering llvm.amdgcn.if
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 6b767d9..8cb9a54 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
@gv = external addrspace(4) constant i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 573017f7..4fc0488 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
; Check lowering of some large extractelement operations that use the stack
; instead of register indexing.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index c424738..3605dae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 63c3146..e4acee9d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index e6a02c6..ac17dde 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 9b35920..e6e98fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
; GCN-LABEL: dyn_extract_v8f32_const_s_v:
@@ -3211,7 +3211,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
@@ -3303,7 +3303,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
@@ -4215,7 +4215,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
@@ -4300,7 +4300,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
@@ -4569,7 +4569,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
@@ -4657,7 +4657,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 870a748..1aee6ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1,19 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Denormal mode shouldn't matter for f16; check with and without flushing.
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s
define half @v_fdiv_f16(half %a, half %b) {
; GFX6-IEEE-LABEL: v_fdiv_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index be894f2..3ea918e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX6-IEEE,GFX6-IEEE-FASTFMA %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX6-FLUSH,GFX6-FLUSH-FASTFMA %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX6-IEEE,GFX6-IEEE-FASTFMA %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX6-FLUSH,GFX6-FLUSH-FASTFMA %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=pitcairn -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX6-IEEE,GFX6-IEEE-SLOWFMA %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=pitcairn -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX6-FLUSH,GFX6-FLUSH-SLOWFMA %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=pitcairn -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX6-IEEE,GFX6-IEEE-SLOWFMA %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=pitcairn -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX6-FLUSH,GFX6-FLUSH-SLOWFMA %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX89-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX89-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX89-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX89-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX89-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX89-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX89-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX89-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX10,GFX10-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-IEEE %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FLUSH %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX11,GFX11-IEEE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-FLUSH %s
define float @v_fdiv_f32(float %a, float %b) {
; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 8db1f46..ea149cc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -1,18 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11 %s
define double @v_fdiv_f64(double %a, double %b) {
; GFX6-LABEL: v_fdiv_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index 340e293..c4d57ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) {
; GFX942-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll
index 5909fe3..c349051 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s
define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX942-LABEL: name: flat_atomic_fadd_v2f16_rtn
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 8a80afd..b2a4c82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
@@ -257,20 +257,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT: v_mov_b32_e32 v2, 15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -357,20 +353,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; UNALIGNED_GFX12: ; %bb.0: ; %bb
; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm
bb:
@@ -937,19 +929,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -1048,19 +1038,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
-; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm
bb:
@@ -1579,19 +1567,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -1692,19 +1678,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
-; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm
bb:
@@ -4060,9 +4044,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -4113,9 +4095,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; UNALIGNED_GFX12: ; %bb.0: ; %bb
; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm
bb:
@@ -4172,9 +4152,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -4223,9 +4201,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; UNALIGNED_GFX12: ; %bb.0: ; %bb
; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index cf0547e..d2c93e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefix=GFX12 %s
define float @test_min_max_ValK0_K1_f32(float %a) #0 {
; GFX10-LABEL: test_min_max_ValK0_K1_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 63009bd..8192d4a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX90A
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX942
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefix=GFX90A
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefix=GFX942
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e4e6c44..eafad58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=CI %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=VI %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 3cde30f..8c01bc7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -enable-var-scope %s
; FIXME: Also test with a pre-gfx8 target.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
index 831ca4d78..c448d2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
@lds0 = addrspace(3) global [512 x float] poison
@lds1 = addrspace(3) global [256 x float] poison
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index d94bf3a..4ed1cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -o - %s | FileCheck %s
; Make sure the waterfall loop does not fail the verifier after regalloc fast
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll
index 0b0c7b7..9c38e1e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -verify-machineinstrs %s -o - 2>%t | FileCheck %s
+; RUN: llc -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator %s -o - 2>%t | FileCheck %s
; RUN: FileCheck -check-prefix=ERR %s < %t
; ERR: remark: <unknown>:0:0: unable to translate instruction: call: ' %sgpr = call <4 x i32> asm sideeffect "; def $0", "={s[8:12]}"()' (in function: return_type_is_too_big_vector)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll
index 6515d25..6da689b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -global-isel -o - %s | FileCheck %s
define i32 @test_sgpr_reg_class_constraint() nounwind {
; CHECK-LABEL: test_sgpr_reg_class_constraint:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 9485376..3e16026 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; Check lowering of some large insertelement that use the stack
; instead of register indexing.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 2eb7486..cae833b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i16_s_s:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 1701a9c..fe7d421 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11 %s
define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i8_s_s:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index 2971049..920d8fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 {
; GCN-LABEL: v_insert_v64i32_37:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
index 5b8c284..dde566d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
---
name: bswap_i32_vv
@@ -19,6 +21,7 @@ body: |
; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935
; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec
; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]]
+ ;
; GFX8-LABEL: name: bswap_i32_vv
; GFX8: liveins: $vgpr0
; GFX8-NEXT: {{ $}}
@@ -26,6 +29,22 @@ body: |
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
+ ;
+ ; GFX9-LABEL: name: bswap_i32_vv
+ ; GFX9: liveins: $vgpr0
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
+ ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
+ ;
+ ; GFX10-LABEL: name: bswap_i32_vv
+ ; GFX10: liveins: $vgpr0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
+ ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = G_BSWAP %0
S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
index 0a4cb3cc..fa95f33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
---
@@ -24,6 +24,24 @@ body: |
; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
;
+ ; GFX9-LABEL: name: fshr_s32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]]
+ ;
+ ; GFX10-LABEL: name: fshr_s32
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]]
+ ;
; GFX11-LABEL: name: fshr_s32
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
index 4b0ff1b..d4b485a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc --global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX11
-; RUN: llc --global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX10
+; RUN: llc --global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=irtranslator %s -o - | FileCheck %s --check-prefix=GFX11
+; RUN: llc --global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=irtranslator %s -o - | FileCheck %s --check-prefix=GFX10
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll
index 2e95011..0317ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn -O0 -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
declare void @llvm.amdgcn.s.sendmsg(i32 immarg, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
index f50d5f3..122b8fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck -check-prefix=HSA-VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck -check-prefix=LEGACY-MESA-VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator %s -o - | FileCheck -check-prefix=HSA-VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator %s -o - | FileCheck -check-prefix=LEGACY-MESA-VI %s
define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
; HSA-VI-LABEL: name: i8_arg
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll
index a81ce31..4098f64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -o - %s | FileCheck %s
; TODO: Could potentially insert it here
define void @arg_align_8(ptr addrspace(1) align 8 %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
index a12ee14..3e7a567 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
; Test that we don't insert code to pass implicit arguments we know
; the callee does not need.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
index 6e85ccb..33862de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=GFX908 %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX908 %s
; Workitem IDs are passed to the kernel differently for gfx908
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index 21cac11..c06af21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
; amdgpu_gfx calling convention
declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 96ee15f..736bc8b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s
declare i1 @external_i1_func_void() #0
declare zeroext i1 @external_i1_zeroext_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
index 2910d35..b5a87ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s
declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }), ptr addrspace(5) byval({ i8, i32 })) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 92106d7..1af175a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
declare hidden void @external_void_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
index aa63e59..f8a84bf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stop-after=irtranslator -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator -o - %s | FileCheck %s
@var = global i32 poison
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
index 3a31ab4..4f360ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -stop-after=irtranslator %s -o - | FileCheck %s
define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_strict
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll
index 9ec3c83..ee35e1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stop-after=irtranslator < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator < %s | FileCheck %s
define amdgpu_kernel void @system_one_as_acquire() {
; CHECK-LABEL: name: system_one_as_acquire
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 5d4f64f..d80f332 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -2,7 +2,7 @@
; Note update_mir_test_checks does not support generating checks for
; the frame info, so some functions have manually added stack object
; checks.
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -o - %s | FileCheck %s
; FIXME: pre-VI should have same ABI without legal i16 operations.
define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index ac0d5ee..7faa43a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
index 96c9f40..fbec70d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -global-isel -stop-after=irtranslator -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
define amdgpu_kernel void @asm_convergent() convergent{
; CHECK-LABEL: name: asm_convergent
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index ec07b0b..3e44f33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -simplify-mir -global-isel -mtriple=amdgcn -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -simplify-mir -global-isel -mtriple=amdgcn -stop-after=irtranslator %s -o - | FileCheck %s
; Check the flags set on the memory operands for loads determined to
; be constants by alias analysis.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
index b83b8a0..e469609 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -O0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -O0 -stop-after=irtranslator %s -o - | FileCheck %s
; Size operand should be the minimum of the two pointer sizes.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll
index b53610a..f74a7e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs -stop-after=irtranslator < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -stop-after=irtranslator < %s | FileCheck %s
define void @prefetch_read(ptr %ptr) {
; CHECK-LABEL: name: prefetch_read
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
index 7a8e521..ffeb7c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stop-after=irtranslator < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator < %s | FileCheck %s
define ptr @ptrmask_flat_i64(ptr %ptr, i64 %mask) {
; CHECK-LABEL: name: ptrmask_flat_i64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
index ca580d8..72c176d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; This is a copy of sibling-call.ll, but stops after the IRTranslator.
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
index d3a6f70..477fcec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
declare hidden void @external_void_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll
index b655f57..eeaf8ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn -O0 -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
define i8 @f_i1_1() {
; CHECK-LABEL: name: f_i1_1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
index d3bc661..e3b9250 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel < %s | FileCheck %s
; early-tailduplication deletes cycle exit block created by structurize-cfg
; that had exactly one predecessor. Now, new cycle exit block has two
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 859f7ef..e4135fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
; TODO: Replace with existing DAG tests
@lds_512_4 = internal unnamed_addr addrspace(3) global [128 x i32] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
index b68cc98..cfbb429 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_{{read2|load_2addr}}_b32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
index 0b9f31e..82886ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; FIXME: Merge with DAG test
@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
index 39dde4b..cabb37c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index 3b16c77..5ed84fd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) {
; GFX9-LABEL: name: atomic_swap_1d
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
index be3fe91..4f5f52b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
@@ -31,3 +31,33 @@ body: |
S_ENDPGM 0
...
+---
+name: memcpy_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpy_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
index a82ca30..0392aef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
@@ -31,3 +31,33 @@ body: |
S_ENDPGM 0
...
+---
+name: memcpyinline_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpyinline_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
index e7cfaab..1f8d1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
@@ -31,3 +31,33 @@ body: |
S_ENDPGM 0
...
+---
+name: memmove_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memmove_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
index 021cebb..dda94e15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
@@ -30,3 +30,32 @@ body: |
S_ENDPGM 0
...
+---
+name: memset_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: memset_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8)
+ ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s16) = G_TRUNC %3:_(s32)
+ %5:_(s8) = G_TRUNC %4:_(s16)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (volatile store (s8))
+ S_ENDPGM 0
+
+...
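For the memset case above, a minimal illustrative IR sketch (not part of this patch): llvm.memset takes the fill value as an i8, which is why the MIR test truncates the incoming s32 down to s8 before feeding it to G_MEMSET.

; Illustrative IR only -- a 1-byte volatile memset matching the MIR test shape.
declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)

define void @volatile_set1(ptr %dst, i8 %val) {
  ; 'i1 true' marks the memset volatile, matching the volatile store operand.
  call void @llvm.memset.p0.i64(ptr %dst, i8 %val, i64 1, i1 true)
  ret void
}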
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir
index cd69104..69e3561 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir
@@ -80,8 +80,7 @@ body: |
; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]]
; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32)
- ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16
- ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32)
+ ; GFX8-NEXT: $vgpr0 = COPY [[ASHR]](s32)
;
; GFX9-LABEL: name: test_smulh_s16
; GFX9: liveins: $vgpr0, $vgpr1
@@ -93,8 +92,7 @@ body: |
; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]]
; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32)
- ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16
- ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ASHR]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -200,9 +198,7 @@ body: |
; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 16
; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG2]], [[SEXT_INREG3]]
; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[MUL1]], [[C]](s32)
- ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16
- ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR1]], 16
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ASHR]](s32), [[ASHR1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 7ec27f4..7916267 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
declare i16 @llvm.abs.i16(i16, i1)
declare i32 @llvm.abs.i32(i32, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 618dd45..5171403 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -global-isel < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 0bbb40b..7b01f13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck %s
declare i64 @llvm.amdgcn.ballot.i64(i1)
declare i64 @llvm.ctpop.i64(i64)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
index d165fb5..79760ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
; FIXME: Error on non-HSA target
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index ce19559..0535394 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 1e86f08..85c1d3a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
index e928f3f..3a0ef12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
; GCN-LABEL: test_wave64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index 1d9514c..cd8ce7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
index 67ec5cb..5d85a96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX101 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX103 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX101 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=GFX103 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define float @v_mul_legacy_f32(float %a, float %b) {
; GFX6-LABEL: v_mul_legacy_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index d0d4f4b..70bfb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
; GFX10-LABEL: global_atomic_csub:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
index 0bf2376..ce8cba2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index a5a75f7..973a76a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
; GCN-LABEL: test_wave64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
index 94dc519..3183378 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) {
; GFX9-LABEL: getresinfo_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
index 496f9f4..a3c507b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
; GFX6-LABEL: getresinfo_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 19b0057..85ab4c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 < %s | FileCheck -check-prefix=GFX8-PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps half @load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_f16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
index ecf81f6..fc48664 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX68 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX68 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefix=NOPRT %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX68 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX68 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=-enable-prt-strict-null < %s | FileCheck -check-prefix=NOPRT %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps float @load_1d_f32_x(<8 x i32> inreg %rsrc, i32 %s) {
; GFX68-LABEL: load_1d_f32_x:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
index fb4c923..2d0d04e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GFX6-LABEL: load_2d_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index ce121c4..676bd88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
index 11ad98a..a101a15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index 494c524..b20dc4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) {
; GFX9-LABEL: load_3d_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index 162a586..7f32d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
; GFX6-LABEL: load_3d_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll
index a39d7ae..159d1e3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_cd_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index f03dce0..86e2d71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll
index 7d693d8..8d9f9d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck -check-prefix=GCN %s
; FIXME: Dropped parts from original test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index 1813003..a097032 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GFX11-LABEL: v_interp_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll
index 92a0dd5..780e036 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-32BANK %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-32BANK %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-16BANK %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-32BANK %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8-32BANK %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx810 < %s | FileCheck -check-prefixes=GFX8-16BANK %s
define amdgpu_ps float @interp_f16(float %i, i32 inreg %m0) #0 {
; GFX9-32BANK-LABEL: interp_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 0bcf52a..2b595b9b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: not llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1013 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: not llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
index 2707c91..ee9cf0b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,6 +1,6 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s
; ALL-LABEL: {{^}}test:
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
index 3bf5559..dd5a9ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=instruction-select < %s | FileCheck %s
define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) {
; CHECK-LABEL: name: basic_raw_buffer
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 3f5a99c..393a462 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index 76e56d9..90e2840 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s
; FIXME: Merge with DAG test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
index dd351e1..0467547 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
; FIXME: Error on non-hsa target
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
index 835fb46..62f8f89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12 %s
; Natural mapping
define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
index 4973129..364ed62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12 %s
; Natural mapping
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
index d3cc70a..c6dd229 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 2>&1 | FileCheck %s -check-prefix=GFX908
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
index b1846b8..39737bf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX90A
; Natural mapping
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
index 1977712..498ddfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Natural mapping
define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
index f098350..feaf7ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s
; Natural mapping
define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
index 8e167b9..46ca43b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
index b4bf05fd..3fbfb63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX6
-; RUN: llc -global-isel -mcpu=hawaii -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX7
-; RUN: llc -global-isel -mcpu=fiji -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX8
-; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
-; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
-; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX67,GFX6
+; RUN: llc -global-isel -mcpu=hawaii -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX67,GFX7
+; RUN: llc -global-isel -mcpu=fiji -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX8
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX12
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: name: raw_buffer_load_i8_tfe
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index 8160ba4..63ca7be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll
index d7844c5..7760a8d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll
index 3852a02..229a593 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll
index ac23cbf..bcc1e49 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 2>&1 | FileCheck %s -check-prefix=GFX908
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
declare <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll
index 42c0749..ac73232 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX90A
; Natural mapping
define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
index cf059da..13f9cce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
; Natural mapping
define amdgpu_ps half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll
index d9c6167..636ba9b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
index 0625981..89c3a41 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
index ec0bd1f..a15b34d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
index cb4fd29..9d8f47a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll
index 615543c..4d7d3ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck %s
define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll
index 99bc50e..12c6029 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll
index cc70c27..3a43ecf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll
index 5092060..15b3124 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
index 0850fdf..50b3387 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
index f6670ba..0ae2833 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; GFX10_GFX11-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
index cb622d2..977d7d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
index 1e61db7..9de5b67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
index 8d82772..91706ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Natural mapping
define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index 24fe2d1..50377e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define float @v_rsq_clamp_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
index daa1923..ca0e190 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; FIXME: Merge with regbankselect, which mostly overlaps when all types supported.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
index 7d08458..7052d08 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=verde -show-mc-encoding < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s
; FIXME: This test has a DAG duplicate
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.sleep.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.sleep.ll
index a370408..a0a946c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.sleep.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.sleep.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.amdgcn.s.sleep(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 45bade2..b2f3e5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=GFX6 %s
define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
; GFX6-LABEL: v_bfe_i32_arg_arg_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index 8f0ae8c..16babfe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefix=GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 06560af..cf835a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 0d72935..4dbcffe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 2c44d71..e411c23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll
index 200d38a..1915338 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
define amdgpu_ps float @softwqm_f32(float %val) {
; GCN-LABEL: name: softwqm_f32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
index abee7de..75d6c59 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
index 1c00ffb..c9d1227 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
index 1b21af8..5a6c5a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 2>&1 | FileCheck %s -check-prefix=GFX908
; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr(<4 x s32>), %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32), align 1, addrspace 8) (in function: buffer_atomic_add_f32_rtn)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
index c002764..7b59ce1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX90A
; Natural mapping
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
index 98a2780..aea128e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
index c2ab42b..c164144 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted
define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index 588b020..9b5e46b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
index de9bffe..674fe1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX6
-; RUN: llc -global-isel -mcpu=hawaii -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX7
-; RUN: llc -global-isel -mcpu=fiji -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX8
-; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
-; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
-; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX67,GFX6
+; RUN: llc -global-isel -mcpu=hawaii -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX67,GFX7
+; RUN: llc -global-isel -mcpu=fiji -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX8
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX12
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: name: raw_buffer_load_i8_tfe
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
index 6923810..bd6c141 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
index 210c3bb..8183d85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
; Natural mapping
define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll
index cc937f4..968e2ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll
index fb67dda..117fec3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd-with-ret.ll
index a71e7eb..a6767c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd-with-ret.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: not llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 2>&1 | FileCheck %s -check-prefix=GFX908
; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr(<4 x s32>), %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr, %{{[0-9]+}}:sgpr, 0, 0, -1 :: (volatile dereferenceable load store (s32) on %ir.rsrc.load, align 1, addrspace 8) (in function: buffer_atomic_add_f32_rtn)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll
index 18568aa..0c7f471 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -o - %s | FileCheck %s -check-prefix=GFX90A
; Natural mapping
define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
index bc4bd34..30ce367 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
define amdgpu_ps half @struct_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: struct_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
index caaa765..4c59812 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted
define amdgpu_ps float @struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll
index 95789b5..4ae456d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_ptr_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll
index fe2b048..e811d33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=PACKED %s
define amdgpu_ps void @struct_ptr_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(half %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; UNPACKED-LABEL: name: struct_ptr_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
index a18d0c2..f331e29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps void @struct_ptr_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
index cae9448..49918e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=UNPACKED %s
define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll
index b08b46f..d644ef9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s
define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
index 87c1e7b..3c22f35 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=UNPACKED %s
define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
index 23468c2..7c811f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK-GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK-GFX12 %s
define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll
index 65ecaa1..1bfec2b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll
index 92bad5e..8b379f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index d327c15..3319ca1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=GFX6 %s
define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
; GFX6-LABEL: v_bfe_i32_arg_arg_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 287a009..8204f86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefix=GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10 %s
define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index b14af9e..eeedc08 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index a664c8a..df90085 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 41f57bb..e5d9884 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-LABEL: dpp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 603eb88..57d3db4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index 7deaca4..c0d983a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index d564682..66cdfc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -1,14 +1,14 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index e79177c..8a53c86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-32 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10-64 %s
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll
index edc93f4..a25e1f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
define amdgpu_ps float @wqm_f32(float %val) {
; GCN-LABEL: name: wqm_f32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll
index 17f3dd7..521300b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps float @test_writelane_s_s_s(i32 inreg %data, i32 inreg %lane, i32 inreg %vdst.in) #0 {
; GFX7-LABEL: test_writelane_s_s_s:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll
index bf48683..9201de5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
; NOTE: llvm.amdgcn.wwm is deprecated; use llvm.amdgcn.strict.wwm instead.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
index dfc9995..7c0484b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index e8de761..e0016b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index de9af52..d5cd7c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 7cd3bab..04652af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 21f1af1..caaface 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 67a089b..cbfdfd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v3i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index cea848e..ed248b4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
; Unaligned DS access is available from GFX9 onwards.
; LDS alignment enforcement is controlled by a configuration register:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
new file mode 100644
index 0000000..92e532b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck %s
+
+define amdgpu_ps void @uniform_load_i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1, ptr addrspace(1) inreg %ptr2) {
+; CHECK-LABEL: uniform_load_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v0, s[2:3]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s1, v2
+; CHECK-NEXT: s_add_i32 s0, s0, s1
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: global_store_dword v0, v1, s[4:5]
+; CHECK-NEXT: s_endpgm
+ %load0 = load volatile i32, ptr addrspace(1) %ptr0
+ %load1 = load i32, ptr addrspace(1) %ptr1, align 1
+ %sum = add i32 %load0, %load1
+ store i32 %sum, ptr addrspace(1) %ptr2
+ ret void
+}
+
+define amdgpu_ps void @uniform_load_v2i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: uniform_load_v2i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: s_add_i32 s0, s0, s1
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
+ %elt0 = extractelement <2 x i32> %load, i32 0
+ %elt1 = extractelement <2 x i32> %load, i32 1
+ %sum = add i32 %elt0, %elt1
+ store i32 %sum, ptr addrspace(1) %ptr1
+ ret void
+}
+
+define amdgpu_ps void @uniform_load_v3i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: uniform_load_v3i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: s_add_i32 s0, s0, s1
+; CHECK-NEXT: s_add_i32 s0, s0, s4
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: global_store_dword v3, v0, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load <3 x i32>, ptr addrspace(1) %ptr0, align 2
+ %elt0 = extractelement <3 x i32> %load, i32 0
+ %elt1 = extractelement <3 x i32> %load, i32 1
+ %elt2 = extractelement <3 x i32> %load, i32 2
+ %sum0 = add i32 %elt0, %elt1
+ %sum = add i32 %sum0, %elt2
+ store i32 %sum, ptr addrspace(1) %ptr1
+ ret void
+}
+
+define amdgpu_ps void @uniform_load_v4i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: uniform_load_v4i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-NEXT: s_add_i32 s0, s0, s1
+; CHECK-NEXT: s_add_i32 s0, s0, s4
+; CHECK-NEXT: s_add_i32 s0, s0, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: global_store_dword v4, v0, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile <4 x i32>, ptr addrspace(1) %ptr0
+ %elt0 = extractelement <4 x i32> %load, i32 0
+ %elt1 = extractelement <4 x i32> %load, i32 1
+ %elt2 = extractelement <4 x i32> %load, i32 2
+ %elt3 = extractelement <4 x i32> %load, i32 3
+ %sum0 = add i32 %elt0, %elt1
+ %sum1 = add i32 %sum0, %elt2
+ %sum = add i32 %sum1, %elt3
+ store i32 %sum, ptr addrspace(1) %ptr1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
index 9e58b71..dc782aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -o - %s | FileCheck %s
define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspace(6) inreg %arg3) {
; CHECK-LABEL: test1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index c87c334..1cd9c0b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s
declare i32 @llvm.amdgcn.workitem.id.x()
; A 64-bit multiplication where no arguments were zero-extended.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index a224c8b..6cc192c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1,11 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16, -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GCN-LABEL: s_mul_i16:
@@ -22,6 +23,11 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -74,6 +80,13 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -109,6 +122,13 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16_zeroext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -165,6 +185,15 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16_zeroext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -188,6 +217,13 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_sext_i32_i16 s0, s0
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i16_signext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
@@ -248,6 +284,15 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i16_signext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
@@ -267,6 +312,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s1
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
ret i32 %result
}
@@ -293,6 +343,13 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
}
@@ -315,6 +372,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
; GFX12-NEXT: s_mul_i32 s0, s0, s2
; GFX12-NEXT: s_mul_i32 s1, s1, s3
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s0, s0, s2
+; GFX1250-NEXT: s_mul_i32 s1, s1, s3
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
@@ -344,6 +407,14 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
@@ -400,6 +471,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i33:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i33 %num, %den
ret i33 %result
}
@@ -456,6 +532,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
ret i64 %result
}
@@ -504,6 +585,13 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
}
@@ -620,6 +708,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0
; GFX12-NEXT: s_mov_b32 s0, s5
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i96:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s6, s0, s5
+; GFX1250-NEXT: s_mul_i32 s7, s1, s4
+; GFX1250-NEXT: s_mul_i32 s2, s2, s3
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s7
+; GFX1250-NEXT: s_mul_hi_u32 s7, s0, s3
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s2
+; GFX1250-NEXT: s_mul_i32 s2, s0, s4
+; GFX1250-NEXT: s_mul_i32 s5, s0, s3
+; GFX1250-NEXT: s_mul_hi_u32 s0, s0, s4
+; GFX1250-NEXT: s_add_co_u32 s2, s2, s7
+; GFX1250-NEXT: s_mul_i32 s4, s1, s3
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s0, s6
+; GFX1250-NEXT: s_mul_hi_u32 s3, s1, s3
+; GFX1250-NEXT: s_add_co_u32 s1, s4, s2
+; GFX1250-NEXT: s_add_co_ci_u32 s2, s3, s0
+; GFX1250-NEXT: s_mov_b32 s0, s5
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i96 %num, %den
%cast = bitcast i96 %result to <3 x i32>
ret <3 x i32> %cast
@@ -686,6 +794,25 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i96:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
+; GFX1250-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
@@ -895,6 +1022,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX12-NEXT: s_mov_b32 s1, s8
; GFX12-NEXT: s_mov_b32 s2, s7
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s9, s0, s6
+; GFX1250-NEXT: s_mul_i32 s11, s1, s5
+; GFX1250-NEXT: s_mul_hi_u32 s10, s0, s6
+; GFX1250-NEXT: s_mul_hi_u32 s12, s1, s5
+; GFX1250-NEXT: s_add_co_u32 s9, s11, s9
+; GFX1250-NEXT: s_mul_i32 s11, s2, s4
+; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10
+; GFX1250-NEXT: s_mul_hi_u32 s12, s2, s4
+; GFX1250-NEXT: s_mul_hi_u32 s8, s0, s4
+; GFX1250-NEXT: s_add_co_u32 s9, s11, s9
+; GFX1250-NEXT: s_mul_i32 s11, s0, s5
+; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10
+; GFX1250-NEXT: s_mul_hi_u32 s12, s0, s5
+; GFX1250-NEXT: s_add_co_u32 s8, s11, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s9, s12, s9
+; GFX1250-NEXT: s_mul_i32 s12, s1, s4
+; GFX1250-NEXT: s_mul_hi_u32 s13, s1, s4
+; GFX1250-NEXT: s_cselect_b32 s11, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s8, s12, s8
+; GFX1250-NEXT: s_mul_i32 s12, s0, s7
+; GFX1250-NEXT: s_add_co_ci_u32 s7, s13, s9
+; GFX1250-NEXT: s_add_co_ci_u32 s9, s10, s12
+; GFX1250-NEXT: s_mul_i32 s1, s1, s6
+; GFX1250-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1250-NEXT: s_mul_i32 s2, s2, s5
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s9, s1
+; GFX1250-NEXT: s_mul_i32 s3, s3, s4
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s2
+; GFX1250-NEXT: s_mul_i32 s0, s0, s4
+; GFX1250-NEXT: s_add_co_i32 s3, s1, s3
+; GFX1250-NEXT: s_mov_b32 s1, s8
+; GFX1250-NEXT: s_mov_b32 s2, s7
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
%cast = bitcast i128 %result to <4 x i32>
ret <4 x i32> %cast
@@ -1036,6 +1199,37 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11]
+; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v13, v10
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
+; GFX1250-NEXT: v_mov_b32_e32 v1, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v7
+; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
}
@@ -2020,6 +2214,185 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX12-NEXT: s_add_co_i32 s7, s1, s7
; GFX12-NEXT: s_mov_b32 s1, s16
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: s_mul_i256:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mul_i32 s17, s0, s10
+; GFX1250-NEXT: s_mul_i32 s19, s1, s9
+; GFX1250-NEXT: s_mul_hi_u32 s18, s0, s10
+; GFX1250-NEXT: s_mul_hi_u32 s20, s1, s9
+; GFX1250-NEXT: s_add_co_u32 s17, s19, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s18, s20, s18
+; GFX1250-NEXT: s_mul_i32 s20, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s21, s2, s8
+; GFX1250-NEXT: s_cselect_b32 s19, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s17, s20, s17
+; GFX1250-NEXT: s_mul_hi_u32 s16, s0, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18
+; GFX1250-NEXT: s_mul_i32 s21, s0, s9
+; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s9
+; GFX1250-NEXT: s_cselect_b32 s20, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s16, s21, s16
+; GFX1250-NEXT: s_add_co_ci_u32 s17, s22, s17
+; GFX1250-NEXT: s_mul_i32 s22, s1, s8
+; GFX1250-NEXT: s_mul_hi_u32 s23, s1, s8
+; GFX1250-NEXT: s_cselect_b32 s21, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s16, s22, s16
+; GFX1250-NEXT: s_add_co_ci_u32 s17, s23, s17
+; GFX1250-NEXT: s_mul_i32 s23, s0, s12
+; GFX1250-NEXT: s_mul_i32 s25, s1, s11
+; GFX1250-NEXT: s_mul_hi_u32 s24, s0, s12
+; GFX1250-NEXT: s_mul_hi_u32 s26, s1, s11
+; GFX1250-NEXT: s_cselect_b32 s22, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s25, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s26, s24
+; GFX1250-NEXT: s_mul_i32 s26, s2, s10
+; GFX1250-NEXT: s_mul_hi_u32 s27, s2, s10
+; GFX1250-NEXT: s_cselect_b32 s25, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s26, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s27, s24
+; GFX1250-NEXT: s_mul_i32 s27, s3, s9
+; GFX1250-NEXT: s_mul_hi_u32 s28, s3, s9
+; GFX1250-NEXT: s_cselect_b32 s26, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s27, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s28, s24
+; GFX1250-NEXT: s_mul_i32 s28, s4, s8
+; GFX1250-NEXT: s_mul_hi_u32 s29, s4, s8
+; GFX1250-NEXT: s_cselect_b32 s27, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s28, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s24, s29, s24
+; GFX1250-NEXT: s_mul_i32 s29, s0, s11
+; GFX1250-NEXT: s_mul_hi_u32 s30, s0, s11
+; GFX1250-NEXT: s_cselect_b32 s28, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s29, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s30, s23
+; GFX1250-NEXT: s_mul_i32 s30, s1, s10
+; GFX1250-NEXT: s_mul_hi_u32 s31, s1, s10
+; GFX1250-NEXT: s_cselect_b32 s29, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s30, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s31, s23
+; GFX1250-NEXT: s_mul_i32 s31, s2, s9
+; GFX1250-NEXT: s_mul_hi_u32 s33, s2, s9
+; GFX1250-NEXT: s_cselect_b32 s30, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s31, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s33, s23
+; GFX1250-NEXT: s_mul_i32 s33, s3, s8
+; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s8
+; GFX1250-NEXT: s_cselect_b32 s31, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s18, s33, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s23, s34, s23
+; GFX1250-NEXT: s_cselect_b32 s33, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s22, 0
+; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s14
+; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18
+; GFX1250-NEXT: s_cselect_b32 s21, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s20, 0
+; GFX1250-NEXT: s_mul_hi_u32 s34, s1, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s21, 0
+; GFX1250-NEXT: s_mul_i32 s21, s0, s14
+; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, s23
+; GFX1250-NEXT: s_mul_i32 s23, s1, s13
+; GFX1250-NEXT: s_cselect_b32 s20, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s2, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s2, s12
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s3, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s11
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s4, s10
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s4, s10
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s5, s9
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s5, s9
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s6, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s6, s8
+; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
+; GFX1250-NEXT: s_mul_i32 s23, s0, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
+; GFX1250-NEXT: s_mul_hi_u32 s34, s0, s13
+; GFX1250-NEXT: s_add_co_u32 s23, s23, s24
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s34, s21
+; GFX1250-NEXT: s_mul_i32 s34, s1, s12
+; GFX1250-NEXT: s_mul_hi_u32 s35, s1, s12
+; GFX1250-NEXT: s_cselect_b32 s24, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s34, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s35, s21
+; GFX1250-NEXT: s_mul_i32 s35, s2, s11
+; GFX1250-NEXT: s_mul_hi_u32 s36, s2, s11
+; GFX1250-NEXT: s_cselect_b32 s34, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s35, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s36, s21
+; GFX1250-NEXT: s_mul_i32 s36, s3, s10
+; GFX1250-NEXT: s_mul_hi_u32 s37, s3, s10
+; GFX1250-NEXT: s_cselect_b32 s35, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s36, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s37, s21
+; GFX1250-NEXT: s_mul_i32 s37, s4, s9
+; GFX1250-NEXT: s_mul_hi_u32 s38, s4, s9
+; GFX1250-NEXT: s_cselect_b32 s36, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s37, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s38, s21
+; GFX1250-NEXT: s_mul_i32 s38, s5, s8
+; GFX1250-NEXT: s_mul_hi_u32 s39, s5, s8
+; GFX1250-NEXT: s_cselect_b32 s37, 1, 0
+; GFX1250-NEXT: s_add_co_u32 s23, s38, s23
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s39, s21
+; GFX1250-NEXT: s_cselect_b32 s38, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s30, 0
+; GFX1250-NEXT: s_mul_i32 s1, s1, s14
+; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s31, 0
+; GFX1250-NEXT: s_mul_i32 s2, s2, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s33, 0
+; GFX1250-NEXT: s_mul_i32 s3, s3, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s20, 0
+; GFX1250-NEXT: s_mul_i32 s4, s4, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s20, s29, s23
+; GFX1250-NEXT: s_cselect_b32 s23, 1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s26, 0
+; GFX1250-NEXT: s_mul_i32 s26, s0, s15
+; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s27, 0
+; GFX1250-NEXT: s_mul_i32 s5, s5, s10
+; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s28, 0
+; GFX1250-NEXT: s_mul_i32 s6, s6, s9
+; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s23, 0
+; GFX1250-NEXT: s_mul_i32 s7, s7, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s15, s25, s21
+; GFX1250-NEXT: s_add_co_ci_u32 s21, s22, s26
+; GFX1250-NEXT: s_cmp_lg_u32 s38, 0
+; GFX1250-NEXT: s_mul_i32 s0, s0, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s21, s1
+; GFX1250-NEXT: s_cmp_lg_u32 s37, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s2
+; GFX1250-NEXT: s_cmp_lg_u32 s36, 0
+; GFX1250-NEXT: s_mov_b32 s2, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GFX1250-NEXT: s_cmp_lg_u32 s35, 0
+; GFX1250-NEXT: s_mov_b32 s3, s18
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s4
+; GFX1250-NEXT: s_cmp_lg_u32 s34, 0
+; GFX1250-NEXT: s_mov_b32 s4, s19
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s5
+; GFX1250-NEXT: s_cmp_lg_u32 s24, 0
+; GFX1250-NEXT: s_mov_b32 s5, s20
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s6
+; GFX1250-NEXT: s_mov_b32 s6, s15
+; GFX1250-NEXT: s_add_co_i32 s7, s1, s7
+; GFX1250-NEXT: s_mov_b32 s1, s16
+; GFX1250-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
@@ -2478,6 +2851,96 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_mul_i256:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v14, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null, v0, v12, 0
+; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
+; GFX1250-NEXT: v_mov_b32_e32 v20, v19
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v21, v22
+; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
+; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
+; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
+; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
+; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
+; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
+; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}
@@ -2536,6 +2999,14 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = zext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -2632,6 +3103,21 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_zext_with_sregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = zext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -2704,6 +3190,14 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = sext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -2815,6 +3309,20 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_mul_u64_sext_with_sregs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = sext i32 %val to i64
%mul = mul i64 %ext, 80
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 8bb060f..21f459a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
-; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s
+; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
+; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s
; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir
index 2c545c8..1025d60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir
@@ -92,8 +92,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GCN-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32)
- ; GCN-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 20
- ; GCN-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[ASHR]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = G_CONSTANT i32 16
%2:_(s32) = G_ASHR %0, %1(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
new file mode 100644
index 0000000..5f72d3e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck %s
+
+define amdgpu_ps void @readanylane_to_virtual_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: readanylane_to_virtual_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile float, ptr addrspace(1) %ptr0
+ store float %load, ptr addrspace(1) %ptr1
+ ret void
+}
+
+define amdgpu_ps float @readanylane_to_physical_vgpr(ptr addrspace(1) inreg %ptr) {
+; CHECK-LABEL: readanylane_to_physical_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %load = load volatile float, ptr addrspace(1) %ptr
+ ret float %load
+}
+
+define amdgpu_ps void @readanylane_to_bitcast_to_virtual_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: readanylane_to_bitcast_to_virtual_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
+ %bitcast = bitcast <2 x i16> %load to i32
+ store i32 %bitcast, ptr addrspace(1) %ptr1
+ ret void
+}
+
+define amdgpu_ps float @readanylane_to_bitcast_to_physical_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: readanylane_to_bitcast_to_physical_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
+ %bitcast = bitcast <2 x i16> %load to float
+ ret float %bitcast
+}
+
+define amdgpu_ps void @unmerge_readanylane_merge_to_virtual_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: unmerge_readanylane_merge_to_virtual_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile i64, ptr addrspace(1) %ptr0
+ store i64 %load, ptr addrspace(1) %ptr1
+ ret void
+}
+
+;define amdgpu_ps double @unmerge_readanylane_merge_to_physical_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; %load = load volatile double, ptr addrspace(1) %ptr0
+; ret double %load
+;}
+
+define amdgpu_ps void @unmerge_readanylane_merge_bitcast_to_virtual_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: unmerge_readanylane_merge_bitcast_to_virtual_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
+ %bitcast = bitcast <2 x i32> %load to double
+ store double %bitcast, ptr addrspace(1) %ptr1
+ ret void
+}
+
+;define amdgpu_ps double @unmerge_readanylane_merge_bitcast_to_physical_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
+; %bitcast = bitcast <2 x i32> %load to double
+; ret double %bitcast
+;}
+
+define amdgpu_ps void @unmerge_readanylane_merge_extract_to_virtual_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: unmerge_readanylane_merge_extract_to_virtual_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v2, v1, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
+ %extracted = extractelement <2 x i32> %load, i32 1
+ store i32 %extracted, ptr addrspace(1) %ptr1
+ ret void
+}
+
+define amdgpu_ps float @unmerge_readanylane_merge_extract_to_physical_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: unmerge_readanylane_merge_extract_to_physical_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %load = load volatile <2 x float>, ptr addrspace(1) %ptr0
+ %extracted = extractelement <2 x float> %load, i32 1
+ ret float %extracted
+}
+
+define amdgpu_ps void @unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
+ %extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>
+ %bitcast = bitcast <2 x i16> %extracted to float
+ store float %bitcast, ptr addrspace(1) %ptr1
+ ret void
+}
+
+define amdgpu_ps float @unmerge_readanylane_merge_extract_bitcast_to_physical_vgpr(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) {
+; CHECK-LABEL: unmerge_readanylane_merge_extract_bitcast_to_physical_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
+ %extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>
+ %bitcast = bitcast <2 x i16> %extracted to float
+ ret float %bitcast
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
new file mode 100644
index 0000000..dd7a3eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
+
+---
+name: readanylane_to_virtual_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: readanylane_to_virtual_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = COPY $sgpr2
+ %4:sgpr(s32) = COPY $sgpr3
+ %5:sgpr(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:sgpr(s32) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
+ G_STORE %6(s32), %5(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: readanylane_to_physical_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: readanylane_to_physical_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
+ ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
+ $vgpr0 = COPY %3(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+...
+
+---
+name: readanylane_to_bitcast_to_virtual_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: readanylane_to_bitcast_to_virtual_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = COPY $sgpr2
+ %4:sgpr(s32) = COPY $sgpr3
+ %5:sgpr(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:sgpr(<2 x s16>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
+ %7:sgpr(s32) = G_BITCAST %6(<2 x s16>)
+ G_STORE %7(s32), %5(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: readanylane_to_bitcast_to_physical_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: readanylane_to_bitcast_to_physical_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+ ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(<2 x s16>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
+ %4:sgpr(s32) = G_BITCAST %3(<2 x s16>)
+ $vgpr0 = COPY %4(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+...
+
+---
+name: unmerge_readanylane_merge_to_virtual_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_to_virtual_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = COPY $sgpr2
+ %4:sgpr(s32) = COPY $sgpr3
+ %5:sgpr(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:sgpr(s64) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
+ G_STORE %6(s64), %5(p1) :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: unmerge_readanylane_merge_to_physical_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_to_physical_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s64) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
+ $vgpr0_vgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+...
+
+---
+name: unmerge_readanylane_merge_bitcast_to_virtual_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_bitcast_to_virtual_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[LOAD]](<2 x s32>)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = COPY $sgpr2
+ %4:sgpr(s32) = COPY $sgpr3
+ %5:sgpr(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:sgpr(<2 x s32>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ %7:sgpr(s64) = G_BITCAST %6(<2 x s32>)
+ G_STORE %7(s64), %5(p1) :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: unmerge_readanylane_merge_bitcast_to_physical_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_bitcast_to_physical_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[LOAD]](<2 x s32>)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(<2 x s32>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ %4:sgpr(s64) = G_BITCAST %3(<2 x s32>)
+ $vgpr0_vgpr1 = COPY %4(s64)
+ SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+...
+
+---
+name: unmerge_readanylane_merge_extract_to_virtual_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_extract_to_virtual_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[UV1]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = COPY $sgpr2
+ %4:sgpr(s32) = COPY $sgpr3
+ %5:sgpr(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:sgpr(<2 x s32>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ %7:sgpr(s32), %8:sgpr(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: unmerge_readanylane_merge_extract_to_physical_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_extract_to_physical_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; CHECK-NEXT: $vgpr0 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(<2 x s32>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
+ %4:sgpr(s32), %5:sgpr(s32) = G_UNMERGE_VALUES %3(<2 x s32>)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+...
+
+---
+name: unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(s32) = COPY $sgpr2
+ %4:sgpr(s32) = COPY $sgpr3
+ %5:sgpr(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:sgpr(<4 x s16>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1)
+ %7:sgpr(<2 x s16>), %8:sgpr(<2 x s16>) = G_UNMERGE_VALUES %6(<4 x s16>)
+ %9:sgpr(s32) = G_BITCAST %7(<2 x s16>)
+ G_STORE %9(s32), %5(p1) :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: unmerge_readanylane_merge_extract_bitcast_to_physical_vgpr
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+ ; CHECK-LABEL: name: unmerge_readanylane_merge_extract_bitcast_to_physical_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:sgpr(<4 x s16>) = G_LOAD %2(p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1)
+ %4:sgpr(<2 x s16>), %5:sgpr(<2 x s16>) = G_UNMERGE_VALUES %3(<4 x s16>)
+ %6:sgpr(s32) = G_BITCAST %4(<2 x s16>)
+ $vgpr0 = COPY %6(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
index 3df5a16..199fd15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
@@ -69,20 +69,19 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT4]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -116,7 +115,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -125,16 +124,15 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT]](s32), [[COPY5]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT1]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[INTRINSIC_CONVERGENT]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
@@ -163,7 +161,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -172,28 +170,27 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT4]](s32), [[COPY5]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT5]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[INTRINSIC_CONVERGENT4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll
index 840b1e8..6b6f611 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
@@ -69,20 +69,19 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT4]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -115,7 +114,7 @@ define amdgpu_ps float @raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
@@ -125,16 +124,15 @@ define amdgpu_ps float @raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT]](s32), [[COPY5]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT1]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[INTRINSIC_CONVERGENT]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
@@ -162,7 +160,7 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
@@ -172,28 +170,27 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT4]](s32), [[COPY5]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT5]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[INTRINSIC_CONVERGENT4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
index 0df8e68..9474bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -67,20 +67,19 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT4]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -115,23 +114,22 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY6]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT]](s32), [[COPY6]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT1]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[INTRINSIC_CONVERGENT]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
@@ -161,35 +159,34 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr6
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT4]](s32), [[COPY6]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT5]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[INTRINSIC_CONVERGENT4]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll
index 9acc9d0..fe848ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -67,20 +67,19 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT4]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -114,7 +113,7 @@ define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -122,16 +121,15 @@ define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY6]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT]](s32), [[COPY6]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT1]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[INTRINSIC_CONVERGENT]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
@@ -160,7 +158,7 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr6
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -168,28 +166,27 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[INTRINSIC_CONVERGENT4]](s32), [[COPY6]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
- ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1)
+ ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT5]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[INTRINSIC_CONVERGENT4]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir
index d446f6b..71adf63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-and-s1.mir
@@ -14,12 +14,14 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_TRUNC %0
%3:_(s1) = G_TRUNC %1
%4:_(s1) = G_AND %2, %3
%5:_(s32) = G_ANYEXT %4
+ S_ENDPGM 0, implicit %5
...
---
@@ -38,6 +40,7 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[ICMP1]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_CONSTANT i32 0
@@ -45,6 +48,7 @@ body: |
%4:_(s1) = G_ICMP intpred(eq), %1, %2
%5:_(s1) = G_AND %3, %4
%6:_(s32) = G_ANYEXT %5
+ S_ENDPGM 0, implicit %6
...
---
@@ -309,6 +313,7 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY]], [[COPY1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[AND]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[AND1]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = COPY $sgpr0
@@ -318,4 +323,5 @@ body: |
%6:_(s1) = G_AND %3, %4
%7:_(s1) = G_AND %5, %6
%8:_(s32) = G_ANYEXT %7
+ S_ENDPGM 0, implicit %8
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
index 9260b06..d954ba0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir
@@ -68,10 +68,12 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[ICMP]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
%3:_(s32) = G_ANYEXT %2
+ S_ENDPGM 0, implicit %3
...
---
@@ -191,9 +193,11 @@ body: |
; CHECK: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_ANYEXT %1
+ S_ENDPGM 0, implicit %2
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-trunc.mir
index 0069692..3744bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-trunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-trunc.mir
@@ -83,9 +83,11 @@ body: |
; CHECK: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s1) = G_TRUNC %0
%2:_(s32) = G_ANYEXT %1
+ S_ENDPGM 0, implicit %2
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
new file mode 100644
index 0000000..beca901
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+---
+name: basic_test
+legalized: true
+machineFunctionInfo:
+ isWholeWaveFunction: true
+body: |
+ bb.1:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: basic_test
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0
+ ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ %1:_(s32) = COPY $vgpr0
+ %2:_(s32) = COPY $vgpr1
+ %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ %12:_(s32) = G_CONSTANT i32 5
+ %11:_(s32) = G_SELECT %0(s1), %1, %12
+ %14:_(s32) = G_CONSTANT i32 3
+ %13:_(s32) = G_SELECT %0(s1), %2, %14
+ %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0
+ $vgpr0 = COPY %15(s32)
+ G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 02f8d0b..1441591 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
; GFX8-LABEL: sdivrem_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
index ee3bf96..344b4ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 %s -o - | FileCheck -check-prefixes=GCN %s
define half @test_s16(half %a) #0 {
; GCN-LABEL: test_s16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
index c82b130..9d6e074 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -global-isel | FileCheck --check-prefix=GCN %s
; GCN-LABEL: vs_epilog
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 0806eec..256d6d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; Test optimization to reduce shifts to narrower sizes.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
index 91f71a8..ad60a61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Test gfx9+ s_shl[1-4]_add_u32 pattern matching
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
index 09274c4..084f240 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
define void @shuffle_to_extract(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GFX942-LABEL: shuffle_to_extract:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
index 1d94d76..ac1e11b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define i32 @test_min_max_ValK0_K1_i32(i32 %a) {
; GFX89-LABEL: test_min_max_ValK0_K1_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index eebe9cd..766b869 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding -global-isel | FileCheck --check-prefixes=SI,GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index e81bae5..38ef707 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index 030f01a..1d2d330 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index fe2667b..017575b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
define i32 @v_usubo_i32(i32 %a, i32 %b) {
; GFX7-LABEL: v_usubo_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
index 569ed35b..c199923 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
define i16 @v_trunc_i32_to_i16(i32 %src) {
; GFX7-LABEL: v_trunc_i32_to_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 1aaf312..ba5a8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
; GFX8-LABEL: udivrem_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
index a8233054..2b54123 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
; GFX89-LABEL: test_min_max_ValK0_K1_u32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
index 7c9e2a5..5408ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -verify-machineinstrs < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=hawaii < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=fiji < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx90a < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx1030 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX10PLUS %s
define i32 @check_v_bfe(i16 %a) {
; PREGFX9-LABEL: check_v_bfe:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 6730df0..d28840d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i8_align4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index cc1c93a..9693d54 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
index 4959e10..6b749df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
index 22c61f9..929a51b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
index 7eafe53..7c0f726 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
index 8049711..da61bc4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index f16ea18..a345ee6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 173dd01..5344ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index 83bbf56..e47350d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 1e9ef07..da68520 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index f01679f..957b7b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 7d74524..427191a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX900 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) {
; GCN-LABEL: scalar_xnor_i32_one_use:
diff --git a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
index a17ad6b..a8bdb41 100644
--- a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
+++ b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; CHECK: ;;#ASMSTART
; CHECK-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 726bfba..be4e369 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
new file mode 100644
index 0000000..b992506
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_svv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
+; GCN-LABEL: add_max_u32_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_co_i32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_max_u32_e32 v0, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: add_max_u32_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_co_i32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_max_u32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
+; GCN-LABEL: add_max_u32_vsi:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, 4, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 4)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
+; GCN-LABEL: add_max_u32_svl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, 0x64, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 100)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_addk_co_i32 s0, 0x64
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_max_u32_e32 v0, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, 100
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_i32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_min_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_min_u32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %min = call i32 @llvm.umin.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %min to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_min_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_min_i32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i32 %a, %b
+ %min = call i32 @llvm.smin.i32(i32 %add, i32 %c)
+ %ret = bitcast i32 %min to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_max_v2u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_max_v2u16_svv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
+; SDAG-LABEL: add_max_v2u16_ssv:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_v2u16_ssv:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GISEL-NEXT: s_lshr_b32 s3, s1, 16
+; GISEL-NEXT: s_add_co_i32 s0, s0, s1
+; GISEL-NEXT: s_add_co_i32 s2, s2, s3
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
+; SDAG-LABEL: add_max_v2u16_sss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_v2u16_sss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lshr_b32 s3, s0, 16
+; GISEL-NEXT: s_lshr_b32 s4, s1, 16
+; GISEL-NEXT: s_add_co_i32 s0, s0, s1
+; GISEL-NEXT: s_add_co_i32 s3, s3, s4
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
+; GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GISEL-NEXT: s_max_u32 s0, s0, s3
+; GISEL-NEXT: s_max_u32 s1, s1, s2
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
+; GCN-LABEL: add_max_v2u16_vsi:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
+; GCN-LABEL: add_max_v2u16_svl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
+; SDAG-LABEL: add_max_v2u16_slv:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_v2u16_slv:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
+; GISEL-NEXT: s_addk_co_i32 s1, 0x64
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, <i16 100, i16 100>
+ %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_max_v2s16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %max to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_min_v2u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %min to float
+ ret float %ret
+}
+
+define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_min_v2s16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+ %add = add <2 x i16> %a, %b
+ %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
+ %ret = bitcast <2 x i16> %min to float
+ ret float %ret
+}
+
+declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll
index 417ff54..dd3aa2c 100644
--- a/llvm/test/CodeGen/AMDGPU/add.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 58a2ab0..b8814b6 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: s_add_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 6cb236d..d25bfbb 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; FIXME: VI or should be unnecessary
diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
index 0d80296..df888b5 100644
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_ADD3_U32
diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index c0d73fc1..ca60598 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; GFX9-LABEL: add_var_var_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/add_i128.ll b/llvm/test/CodeGen/AMDGPU/add_i128.ll
index c2c5046..dcaa856 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i128.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_i128_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
; GCN-LABEL: test_i128_vreg:
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index 9400bf6..eedd56d 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() readnone
diff --git a/llvm/test/CodeGen/AMDGPU/add_shl.ll b/llvm/test/CodeGen/AMDGPU/add_shl.ll
index b1d88a5..03002ed 100644
--- a/llvm/test/CodeGen/AMDGPU/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_shl.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_ADD_LSHL_U32
diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll
new file mode 100644
index 0000000..03730272
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+
+define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) {
+; GFX12-LABEL: test_add_u64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_vs(i64 %a, i64 inreg %b) {
+; GFX12-LABEL: test_add_u64_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) {
+; GFX12-LABEL: test_add_u64_sv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_ss(i64 inreg %a, i64 inreg %b) {
+; GCN-LABEL: test_add_u64_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i64 %a, %b
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_inline_lit(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_inline_lit:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_inline_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 5, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 5
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_small_imm(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_small_imm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x1f4, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_small_imm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 500
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) {
+; GFX12-LABEL: test_add_u64_v_64bit_imm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_v_64bit_imm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 5294967295
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_add_u64_s_small_imm(i64 inreg %a) {
+; GCN-LABEL: test_add_u64_s_small_imm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x1f4
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %add = add i64 %a, 500
+ %ret = bitcast i64 %add to <2 x float>
+ ret <2 x float> %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
index 30ae18f..5afd3ea 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-enable-lower-module-lds=false < %s 2> %t.err | FileCheck %s
+; RUN: not llc -mtriple=amdgcn -amdgpu-enable-lower-module-lds=false < %s 2> %t.err | FileCheck %s
; RUN: FileCheck -check-prefix=ERROR %s < %t.err
; ERROR: error: unsupported expression in static initializer: addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4))
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll
index ab73b51..732372a 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; CHECK: global.arr:
; CHECK: .zero 1024
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
index d3bf94e8..c4f6079 100644
--- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}adjust_writemask_crash_0_nochain:
; GCN: image_get_lod v0, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll
index 30c5ccb..00c5798 100644
--- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s
; Check that write mask is 0xf.
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
index e6e9ee7..63b7b70 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
define void @func_empty() #0 {
; GCN-LABEL: func_empty:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index c7a20055..6e36093 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s
; GCN-LABEL: {{^}}kernel_32_agprs:
; GFX908: .amdhsa_next_free_vgpr 32
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
index f6465de..1a2dd6e 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
; Make sure there are no v_accvgpr_read_b32 copying back and forth
; between AGPR and VGPR.
diff --git a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
index fc13262..e65f401 100644
--- a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
+++ b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}alignbit_shr_pat:
; GCN-DAG: s_load_dword s[[SHR:[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index 4e70227..689b306 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple amdgcn-amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple amdgcn-amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.readfirstlane(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index c31b2ce..3b9682e 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @test0() {
; GFX9-LABEL: test0:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f9b7546..f96a6f7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}kernel_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 95f5947..279d2e2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,GISEL %s
@0 = external dso_local addrspace(4) constant [4 x <2 x float>]
@1 = external dso_local addrspace(4) constant i32
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
index c9a4379..50daf98 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -mattr=+promote-alloca -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: llc -mattr=+promote-alloca,-flat-for-global -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: llc -mattr=-promote-alloca -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -mattr=-promote-alloca,-flat-for-global -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -mattr=+promote-alloca -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: llc -mattr=-promote-alloca -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,GCN-ALLOCA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
index d58a624..18ec3ab 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
@@ -14,8 +14,7 @@ define internal fastcc void @foo(ptr %kg) {
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
; CHECK-NEXT: br label %[[WHILE_COND:.*]]
; CHECK: [[WHILE_COND]]:
-; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[KG]] to ptr addrspace(5)
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[KG]], align 4
; CHECK-NEXT: [[IDXPROM_I:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: switch i32 0, label %[[SW_BB92:.*]] [
; CHECK-NEXT: i32 1, label %[[SW_BB92]]
@@ -23,22 +22,18 @@ define internal fastcc void @foo(ptr %kg) {
; CHECK-NEXT: ]
; CHECK: [[SUBD_TRIANGLE_PATCH_EXIT_I_I35]]:
; CHECK-NEXT: [[ARRAYIDX_I27_I:%.*]] = getelementptr float, ptr [[KG]], i64 [[IDXPROM_I]]
-; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[ARRAYIDX_I27_I]] to ptr addrspace(5)
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX_I27_I]], align 4
; CHECK-NEXT: br label %[[WHILE_COND]]
; CHECK: [[SW_BB92]]:
; CHECK-NEXT: [[INSERT:%.*]] = insertelement <3 x i32> zeroinitializer, i32 [[TMP1]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = bitcast <3 x i32> [[INSERT]] to <3 x float>
; CHECK-NEXT: [[SHFL:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT_I]], <3 x float> zeroinitializer, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5)
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[NUM_CLOSURE_I26_I]], align 4
; CHECK-NEXT: [[IDXPROM_I27_I:%.*]] = sext i32 [[LOAD]] to i64
; CHECK-NEXT: [[ARRAYIDX_I28_I:%.*]] = getelementptr [64 x %struct.ShaderClosure], ptr [[CLOSURE_I25_I]], i64 0, i64 [[IDXPROM_I27_I]]
-; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX_I28_I]] to ptr addrspace(5)
-; CHECK-NEXT: store <4 x float> [[SHFL]], ptr addrspace(5) [[TMP4]], align 16
+; CHECK-NEXT: store <4 x float> [[SHFL]], ptr [[ARRAYIDX_I28_I]], align 16
; CHECK-NEXT: [[INC_I30_I:%.*]] = or i32 [[LOAD]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5)
-; CHECK-NEXT: store i32 [[INC_I30_I]], ptr addrspace(5) [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[INC_I30_I]], ptr [[NUM_CLOSURE_I26_I]], align 4
; CHECK-NEXT: br label %[[WHILE_COND]]
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 6e8a5a1..2889f37 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_gfx void @use(...)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 2d4f748..36e2db0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) {
; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_no_stack:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
index ce2b84e..10ffc18 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -early-live-intervals -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -early-live-intervals < %s | FileCheck --check-prefix=GCN %s
define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(ptr addrspace(1) %p) #4 {
; GCN-LABEL: test_mul24_knownbits_kernel:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll
index b8681a0..4f862ca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=ATTRIB %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=3 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-3 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=4 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-4 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=ATTRIB %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 < %s | FileCheck -check-prefix=FORCE-2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=3 < %s | FileCheck -check-prefix=FORCE-3 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=4 < %s | FileCheck -check-prefix=FORCE-4 %s
; Note: command line argument should override function attribute.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
index 1af5938..46ca26a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -filetype=obj -o %t.o < %s && llvm-readobj -r %t.o | FileCheck --check-prefix=ELF %s
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -filetype=obj -o %t.o < %s && llvm-readobj -r %t.o | FileCheck --check-prefix=ELF %s
; GCN-LABEL: {{^}}ps_main:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
index 91634d8..ad1b78b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}shader_cc:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index a663d45..f4b90b4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
-; RUN: llc < %s -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc < %s -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
-; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=tonga -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=tonga -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=tonga -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
+; RUN: llc < %s -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
+; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn-amdhsa -mcpu=tonga -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc < %s -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn-amdhsa -mcpu=tonga -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc < %s -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -mtriple=amdgcn-amdhsa -mcpu=tonga -mattr=-unaligned-access-mode | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: opt < %s -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
; RUN: opt < %s -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e2510bb..682b78c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-NOHSA,GCN-NOHSA,FUNC %s
-; RUN: llc -global-isel -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-NOHSA,GCN-NOHSA,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-NOHSA,GCN-NOHSA,FUNC %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-NOHSA,GCN-NOHSA,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI-NOHSA,GCN-NOHSA,FUNC %s
+; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI-NOHSA,GCN-NOHSA,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=VI-NOHSA,GCN-NOHSA,FUNC %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=VI-NOHSA,GCN-NOHSA,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index f4d17e5..5f98000 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s
declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
index d06f397..668e950 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; GCN-LABEL: {{^}}cs_amdpal:
; GCN: .amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
index fce918c..a34d6fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}es_amdpal:
; GCN: .amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
index 02a2353..c77dbe4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; GCN-LABEL: {{^}}gs_amdpal:
; GCN: .amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
index 53c6b95..68dfca0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; GCN-LABEL: {{^}}hs_amdpal:
; GCN: .amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
index 0897489..0a61a67 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}ls_amdpal:
; GCN: .amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
index 5e21ba4..c917a2d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; GCN-LABEL: {{^}}cs_amdpal:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
index dc9a33a..154e1e0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
index ffce3ed..e16c94c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
index 3ea3064..cc30461 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
index bcc8da6..e9090f8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
; GCN-LABEL: {{^}}es_amdpal:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
index ef4c9cb..58eaa2e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
; GCN-LABEL: {{^}}gs_amdpal:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
index eb814c1..d02e649 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
; GCN-LABEL: {{^}}hs_amdpal:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
index d4826a2..f8978da 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
@@ -1,13 +1,13 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}}
; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
-; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}}
+; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xe00f0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
index 0d81e70..2443c88 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
; GCN-LABEL: {{^}}ls_amdpal:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
index d31732f..e3603563 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal
; metadata. Check for 0x2c0b (SPI_SHADER_PGM_RSRC2_PS) in pal metadata, and
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
index 15b1a65..ee0cd3a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; This pixel shader does not use the result of its interpolation, so it would
; end up with an interpolation mode set in PSAddr but not PSEnable. This test tests
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
index 42de600..8d34a877 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
; GCN-LABEL: {{^}}vs_amdpal:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ps.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ps.ll
index 4978c34..a03ea7e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ps.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal
; metadata. Check for 0x2c0b (SPI_SHADER_PGM_RSRC2_PS) in pal metadata, and
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
index a289e04..9395be2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; This pixel shader does not use the result of its interpolation, so it would
; end up with an interpolation mode set in PSAddr but not PSEnable. This test tests
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
index 086a126..3d18f04 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; We want to make sure that RSRC2 is left untouched
; GCN: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x78a
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
index 7745696..bf83d65 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; GCN-LABEL: {{^}}vs_amdpal:
; GCN: .amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
index 67382d9..346f38a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
; On gfx9 and later, a HS is a merged shader, in which s0-s7 are reserved by the
; hardware, so the PAL puts the GIT (global information table) in s8 rather
diff --git a/llvm/test/CodeGen/AMDGPU/and-gcn.ll b/llvm/test/CodeGen/AMDGPU/and-gcn.ll
index 095c25d..8350b1f 100644
--- a/llvm/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/and-gcn.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_and_i64_br:
; SI: s_and_b64
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index e5fe919..ca1e7c9 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/and_or.ll b/llvm/test/CodeGen/AMDGPU/and_or.ll
index 9e0a787..3fdf1b7 100644
--- a/llvm/test/CodeGen/AMDGPU/and_or.ll
+++ b/llvm/test/CodeGen/AMDGPU/and_or.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_AND_OR_B32
diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
index a60d14c..52321c8 100644
--- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_clear_msb:
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index 3226a77..e22cee87 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}scalar_andn2_i32_one_use
; GCN: s_andn2_b32
diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
index e68a2cd..4195158 100644
--- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_or_to_orn2:
diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
index cc9f595..18cf120 100644
--- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) %arg1) local_unnamed_addr #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 338dd9d..089d6f5 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll b/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
index f15435d..227aff8 100644
--- a/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; TII::areLoadsFromSameBasePtr failed because the offset for atomics
; is different from a normal load due to the data operand.
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index e1bbc24..e0a8c55 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index a01dc02..e20d242 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index eaceafc..dc31437 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
define amdgpu_kernel void @s_ashr_v2i16(ptr addrspace(1) %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_ashr_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index b50112f..45192be 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s
; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
; GFX9PLUS-NOT: m0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
index 8b026ac..d5b3ee7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=R600,FUNC %s
; FUNC-LABEL: {{^}}atomic_add_local:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index 7f45b03..aaedb85 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
; CI-LABEL: atomic_load_monotonic_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
index c188cb12..26d5055 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
; FUNC-LABEL: {{^}}atomic_sub_local:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 4b68f8a..394727c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3ca7db15..4cc39d9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,30 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_DPP,GFX1164_DPP-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_DPP,GFX1132_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_DPP,GFX1132_DPP-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_DPP,GFX1164_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_DPP,GFX1132_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_DPP,GFX1132_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x()
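; The gfx1100/gfx1200 runs are duplicated per 16-bit register mode:
; +real-true16 selects true 16-bit register halves, -real-true16 the 32-bit
; fallback encoding, giving separate TRUE16/FAKE16 check prefixes for each
; wave size.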
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 0c624a8..0f59304 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn-- - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
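; -verify-machineinstrs is redundant on these RUN lines: the machine verifier
; already runs by default in LLVM_ENABLE_EXPENSIVE_CHECKS builds, so the
; per-test flag mostly added compile time in the default configuration.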
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 0a06fe4..e4def28 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index bc0bec4..39a3c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index 9236b40..c2bb4f00 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
; CI-LABEL: atomic_store_monotonic_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 231f53d..e432399 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_system:
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
index f9a43dd..2cd50b3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
; GCN-LABEL: atomic_nand_i32_lds:
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
index bc9008c..5b705db 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a --pass-remarks=atomic-expand \
; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
index d031326..587157b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a --pass-remarks=si-lower \
; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent due to an unsafe request.
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index e74fd21..887f489 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32)
declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index d45e116..52d28e5 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -1,15 +1,15 @@
; -enable-misched=false makes the register usage more predictable
; -regalloc=fast just makes the test run faster
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX11WGP-WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX11WGP-WAVE64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX11CU-WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX11CU-WAVE64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX11WGP-WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX11WGP-WAVE64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX11CU-WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX11CU-WAVE64
define internal void @use256vgprs() {
%v0 = call i32 asm sideeffect "; def $0", "=v"()
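; Each "; def $0" sideeffect asm keeps one VGPR live, so use256vgprs stacks
; these defs to demand the full register file; the flat-work-group-size
; attribute is then expected to cap how many VGPRs the allocator may hand out
; (a reading of the test setup, not checked exhaustively here).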
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 6168674..0a02be9 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
; CHECK-LABEL: {{^}}min_64_max_64:
; CHECK: SGPRBlocks: 0
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index d0107eb..6a1d594 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=ALL %s
; FIXME: Vectorization can increase required SGPR count beyond limit.
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
index a1594a8..81c0f4c 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
@var = addrspace(1) global float 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index e9fe4f3..41bce31 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
; Exactly 1 wave per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_1:
diff --git a/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll b/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll
index 8eb393f..2145493 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s 2>&1 | FileCheck %s
; CHECK: cannot parse integer attribute amdgpu-num-sgpr
define amdgpu_kernel void @unparseable_single_0() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 7f450ed..b610f11 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -44,13 +44,13 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(1) %ptr to ptr
@@ -62,13 +62,13 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(1) %ptr to ptr
@@ -110,13 +110,13 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META1:![0-9]+]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META1:![0-9]+]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(2) %ptr to ptr
@@ -128,13 +128,13 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META1]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META1]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(2) %ptr to ptr
@@ -176,13 +176,13 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META2:![0-9]+]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META2:![0-9]+]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(3) %ptr to ptr
@@ -194,13 +194,13 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META2]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META2]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(3) %ptr to ptr
@@ -242,13 +242,13 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META3:![0-9]+]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META3:![0-9]+]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(4) %ptr to ptr
@@ -260,13 +260,13 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META3]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META3]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(4) %ptr to ptr
@@ -308,13 +308,13 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4:![0-9]+]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4:![0-9]+]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -326,13 +326,13 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -530,14 +530,14 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX10-NEXT: ret void
;
@@ -551,14 +551,14 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX10-NEXT: ret void
;
@@ -572,14 +572,14 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX10-NEXT: ret void
;
@@ -593,14 +593,14 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META4]]
; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]])
; GFX10-NEXT: ret void
;
@@ -879,3 +879,15 @@ define amdgpu_kernel void @with_inline_asm() {
; GFX10: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
;.
+; GFX9: [[META0]] = !{i32 2, i32 10}
+; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
+; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
+; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
+; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+;.
+; GFX10: [[META0]] = !{i32 2, i32 10}
+; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
+; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
+; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
+; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
index 7ce5a00..d91b2117 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
@@ -514,9 +514,9 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt
ret void
}
-define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val, i32 %offset) #0 {
+define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 {
; CHECK-LABEL: define internal void @callee_alias_addr_space_branch(
-; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]]
; CHECK: [[BB_1_TRUE]]:
; CHECK-NEXT: br label %[[BB_1_END:.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index 7b255a7..b584f6d 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX942-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-BACKOFF %s
; Subtargets must wait for outstanding memory instructions before a barrier if
; they cannot back off of the barrier.
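; Without that feature the expected codegen is roughly:
;   s_waitcnt vmcnt(0) lgkmcnt(0)
;   s_barrier
; while back-off-capable subtargets may issue s_barrier with memory operations
; still in flight.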
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index bc20665..3706eb5 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,9 +1,9 @@
-; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_branch:
; GCNNOOPT: v_writelane_b32
diff --git a/llvm/test/CodeGen/AMDGPU/basic-call-return.ll b/llvm/test/CodeGen/AMDGPU/basic-call-return.ll
index e47e4c1..9ef5989 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-call-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-call-return.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define void @void_func_void() #2 {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/basic-loop.ll b/llvm/test/CodeGen/AMDGPU/basic-loop.ll
index 12821a6..c424a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-loop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
-; RUN: llc -O0 -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_loop:
define amdgpu_kernel void @test_loop(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %val) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
index 55a560c..d4ef12a 100644
--- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs --stop-after=regallocfast,2 -o - %s | FileCheck -check-prefix=REGALLOC %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --stop-after=regallocfast,2 -o - %s | FileCheck -check-prefix=REGALLOC %s
; Test to check if the bb prolog spills are inserted correctly during regalloc.
define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 5b4866c..752a87a 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
; TODO: Add global-isel when it can support bf16
@@ -9,6 +10,11 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
; GCN: ; %bb.0:
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_bf16_f32_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: ; return to shader part epilog
%cvt = fpext bfloat %v to float
ret float %cvt
}
@@ -19,6 +25,13 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
; GCN-NEXT: s_lshl_b32 s0, s0, 16
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_bf16_f32_s:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshl_b32 s0, s0, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%cvt = fpext bfloat %v to float
ret float %cvt
}
@@ -47,6 +60,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
; GFX-950: ; %bb.0:
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = fptrunc <2 x float> %src to <2 x bfloat>
%cast = bitcast <2 x bfloat> %res to float
ret float %cast
@@ -80,6 +98,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GFX-950-NEXT: v_mov_b32_e32 v0, s1
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, s0, v0
; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, s0, s1
+; GFX1250-NEXT: ; return to shader part epilog
%res = fptrunc <2 x float> %src to <2 x bfloat>
%cast = bitcast <2 x bfloat> %res to float
ret float %cast
@@ -103,6 +126,13 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_f32_bf16_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: ; return to shader part epilog
%trunc = fptrunc float %src to bfloat
%ext = fpext bfloat %trunc to float
ret float %ext
@@ -172,6 +202,36 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v8, v[2:3]
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v9, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, v[0:1], v[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, -1, 1, s1
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
+; GFX1250-NEXT: v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
+; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v10
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11
+; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = fptrunc <2 x double> %src to <2 x bfloat>
%cast = bitcast <2 x bfloat> %res to float
ret float %cast
@@ -201,6 +261,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
; GFX-950: ; %bb.0: ; %entry
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: ; return to shader part epilog
entry:
%a.cvt = fptrunc float %a to bfloat
%b.cvt = fptrunc float %b to bfloat
@@ -236,6 +301,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
; GFX-950: ; %bb.0: ; %entry
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
+; GFX1250-NEXT: ; return to shader part epilog
entry:
%a.neg = fneg float %a
%a.cvt = fptrunc float %a.neg to bfloat
@@ -269,6 +339,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f32_to_bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc float %a to bfloat
store bfloat %a.cvt, ptr %out
@@ -298,6 +375,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f32_to_bf16_abs:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call float @llvm.fabs.f32(float %a)
%a.cvt = fptrunc float %a.abs to bfloat
@@ -328,6 +412,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f32_to_bf16_neg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg float %a
%a.cvt = fptrunc float %a.neg to bfloat
@@ -373,6 +464,24 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f64_to_bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc double %a to bfloat
store bfloat %a.cvt, ptr %out
@@ -417,6 +526,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f64_to_bf16_neg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, -v[0:1], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg double %a
%a.cvt = fptrunc double %a.neg to bfloat
@@ -462,6 +590,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f64_to_bf16_abs:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call double @llvm.fabs.f64(double %a)
%a.cvt = fptrunc double %a.abs to bfloat
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 029604c..1adf542 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -2,6 +2,385 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
; TODO: Add a global-isel RUN line once it supports bf16
+define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) {
+; GCN-LABEL: llvm_sqrt_bf16_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_bf16_e32 v2, v2
+; GCN-NEXT: global_store_b16 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src)
+ store bfloat %sqrt, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
+; GCN-LABEL: llvm_sqrt_bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_bf16_e32 v2, s0
+; GCN-NEXT: global_store_b16 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src)
+ store bfloat %sqrt, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_add_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_add_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_add_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_add_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 2.0 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, <bfloat 2.0, bfloat 2.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_add_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_sub_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_sub_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s0 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_sub_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, s0, s1 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, -2.0 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, <bfloat 2.0, bfloat 2.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, 0xc2c8bf80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_lv(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_lv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, 0x42c83f80, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> <bfloat 1.0, bfloat 100.0>, %a
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_iv(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_iv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 1.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %a
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_mul_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, %b
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, %b
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, %b
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+; FIXME: We could do better by folding an inline constant instead of a literal.
+
+define amdgpu_ps void @v_test_mul_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, <bfloat 0.5, bfloat 0.5>
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_min_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_min_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_min_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_min_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 0.5, bfloat 0.5>)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_min_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 1.0, bfloat 100.0>)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_max_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_max_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_max_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_max_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 0.5, bfloat 0.5>)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_max_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 1.0, bfloat 100.0>)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_test_fma_v2bf16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_fma_v2bf16_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_fma_v2bf16_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_fma_v2bf16_vsc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> <bfloat 0.5, bfloat 0.5>)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_fma_v2bf16_vll:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s0, 0x42c83f80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0x43484000
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 1.0, bfloat 100.0>, <2 x bfloat> <bfloat 2.0, bfloat 200.0>)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
; GCN-LABEL: llvm_log2_bf16_v:
@@ -47,5 +426,9 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare bfloat @llvm.sqrt.bf16(bfloat)
declare bfloat @llvm.log2.bf16(bfloat)
declare bfloat @llvm.exp2.bf16(bfloat)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index cd6d741..7859fcdf 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9,GFX900
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 | FileCheck %s -check-prefixes=GFX9,GFX950
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
@@ -967,12 +968,21 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_store_global_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_store_global_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v[1:2], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_store_global_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-NEXT: global_store_dword v[2:3], v0, off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v2bf16:
; GFX10: ; %bb.0:
@@ -2019,23 +2029,41 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_store_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_store_global_v64bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX900-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_store_global_v64bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX950-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v64bf16:
; GFX10: ; %bb.0:
@@ -2204,20 +2232,30 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_load_store_f32_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_load_store_f32_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dword v0, v[0:1], off
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_load_store_f32_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dword v0, v[0:1], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: global_store_short v[2:3], v0, off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_f32_to_bf16:
; GFX10: ; %bb.0:
@@ -2308,30 +2346,50 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_load_store_f64_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_movk_i32 s8, 0x7fff
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
-; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]|
-; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
-; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_add3_u32 v4, v5, v4, s8
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_load_store_f64_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX900-NEXT: s_movk_i32 s8, 0x7fff
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX900-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
+; GFX900-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]|
+; GFX900-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; GFX900-NEXT: v_add_u32_e32 v4, v6, v4
+; GFX900-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_add3_u32 v4, v5, v4, s8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_load_store_f64_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX950-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
+; GFX950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GFX950-NEXT: v_add_u32_e32 v0, v6, v0
+; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: global_store_short v[2:3], v0, off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_f64_to_bf16:
; GFX10: ; %bb.0:
@@ -2858,12 +2916,21 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_arg_store:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_short v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_arg_store:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_store_short v[1:2], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_arg_store:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-NEXT: global_store_short v[2:3], v0, off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store:
; GFX10: ; %bb.0:
@@ -2918,12 +2985,21 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_arg_store_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_arg_store_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v[1:2], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_arg_store_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-NEXT: global_store_dword v[2:3], v0, off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store_v2bf16:
; GFX10: ; %bb.0:
@@ -3384,12 +3460,19 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_byval:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_byval:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_byval:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_store_short off, v0, s32
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_byval:
; GFX10: ; %bb.0:
@@ -3440,12 +3523,19 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_sret:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_sret:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_sret:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_store_short v0, v1, off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_sret:
; GFX10: ; %bb.0:
@@ -3907,34 +3997,63 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_call:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: v_writelane_b32 v2, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_call:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s18, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: s_getpc_b64 s[16:17]
+; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
+; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
+; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX900-NEXT: v_writelane_b32 v2, s30, 0
+; GFX900-NEXT: v_writelane_b32 v2, s31, 1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_readlane_b32 s31, v2, 1
+; GFX900-NEXT: v_readlane_b32 s30, v2, 0
+; GFX900-NEXT: s_mov_b32 s32, s33
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_mov_b32 s33, s18
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_call:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_mov_b32 s2, s33
+; GFX950-NEXT: s_mov_b32 s33, s32
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_getpc_b64 s[0:1]
+; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4
+; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
+; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX950-NEXT: v_writelane_b32 v4, s30, 0
+; GFX950-NEXT: v_writelane_b32 v4, s31, 1
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_readlane_b32 s31, v4, 1
+; GFX950-NEXT: v_readlane_b32 s30, v4, 0
+; GFX950-NEXT: s_mov_b32 s32, s33
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_mov_b32 s33, s2
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call:
; GFX10: ; %bb.0: ; %entry
@@ -4104,34 +4223,63 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_call_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: v_writelane_b32 v2, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_call_v2bf16:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s18, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: s_getpc_b64 s[16:17]
+; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX900-NEXT: v_writelane_b32 v2, s30, 0
+; GFX900-NEXT: v_writelane_b32 v2, s31, 1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_readlane_b32 s31, v2, 1
+; GFX900-NEXT: v_readlane_b32 s30, v2, 0
+; GFX900-NEXT: s_mov_b32 s32, s33
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_mov_b32 s33, s18
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_call_v2bf16:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_mov_b32 s2, s33
+; GFX950-NEXT: s_mov_b32 s33, s32
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_getpc_b64 s[0:1]
+; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX950-NEXT: v_writelane_b32 v4, s30, 0
+; GFX950-NEXT: v_writelane_b32 v4, s31, 1
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_readlane_b32 s31, v4, 1
+; GFX950-NEXT: v_readlane_b32 s30, v4, 0
+; GFX950-NEXT: s_mov_b32 s32, s33
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_mov_b32 s33, s2
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v2bf16:
; GFX10: ; %bb.0: ; %entry
@@ -4308,36 +4456,68 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_call_v3bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_call_v3bf16:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s18, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: s_getpc_b64 s[16:17]
+; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX900-NEXT: v_writelane_b32 v3, s30, 0
+; GFX900-NEXT: v_writelane_b32 v3, s31, 1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_readlane_b32 s31, v3, 1
+; GFX900-NEXT: v_readlane_b32 s30, v3, 0
+; GFX900-NEXT: s_mov_b32 s32, s33
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_mov_b32 s33, s18
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_call_v3bf16:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_mov_b32 s2, s33
+; GFX950-NEXT: s_mov_b32 s33, s32
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_getpc_b64 s[0:1]
+; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX950-NEXT: v_writelane_b32 v5, s30, 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
+; GFX950-NEXT: v_mov_b32_e32 v4, v2
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_readlane_b32 s31, v5, 1
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
+; GFX950-NEXT: s_mov_b32 s32, s33
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_mov_b32 s33, s2
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v3bf16:
; GFX10: ; %bb.0: ; %entry
@@ -4534,36 +4714,66 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_call_v4bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_call_v4bf16:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s18, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: s_getpc_b64 s[16:17]
+; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX900-NEXT: v_writelane_b32 v3, s30, 0
+; GFX900-NEXT: v_writelane_b32 v3, s31, 1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_readlane_b32 s31, v3, 1
+; GFX900-NEXT: v_readlane_b32 s30, v3, 0
+; GFX900-NEXT: s_mov_b32 s32, s33
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_mov_b32 s33, s18
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_call_v4bf16:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_mov_b32 s2, s33
+; GFX950-NEXT: s_mov_b32 s33, s32
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_getpc_b64 s[0:1]
+; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX950-NEXT: v_writelane_b32 v5, s30, 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
+; GFX950-NEXT: v_mov_b32_e32 v4, v2
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_readlane_b32 s31, v5, 1
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
+; GFX950-NEXT: s_mov_b32 s32, s33
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_mov_b32 s33, s2
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v4bf16:
; GFX10: ; %bb.0: ; %entry
@@ -4804,40 +5014,69 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_call_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v5, s30, 0
-; GFX9-NEXT: v_writelane_b32 v5, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v5, 1
-; GFX9-NEXT: v_readlane_b32 s30, v5, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_call_v8bf16:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s18, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX900-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: s_getpc_b64 s[16:17]
+; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX900-NEXT: v_writelane_b32 v5, s30, 0
+; GFX900-NEXT: v_writelane_b32 v5, s31, 1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_readlane_b32 s31, v5, 1
+; GFX900-NEXT: v_readlane_b32 s30, v5, 0
+; GFX900-NEXT: s_mov_b32 s32, s33
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_mov_b32 s33, s18
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_call_v8bf16:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_mov_b32 s2, s33
+; GFX950-NEXT: s_mov_b32 s33, s32
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_getpc_b64 s[0:1]
+; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX950-NEXT: v_writelane_b32 v5, s30, 0
+; GFX950-NEXT: v_writelane_b32 v5, s31, 1
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_readlane_b32 s31, v5, 1
+; GFX950-NEXT: v_readlane_b32 s30, v5, 0
+; GFX950-NEXT: s_mov_b32 s32, s33
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_mov_b32 s33, s2
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v8bf16:
; GFX10: ; %bb.0: ; %entry
@@ -5174,48 +5413,79 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_call_v16bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v9, s30, 0
-; GFX9-NEXT: v_writelane_b32 v9, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v9, 1
-; GFX9-NEXT: v_readlane_b32 s30, v9, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_call_v16bf16:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s18, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX900-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[16:17]
+; GFX900-NEXT: s_addk_i32 s32, 0x400
+; GFX900-NEXT: s_getpc_b64 s[16:17]
+; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX900-NEXT: v_writelane_b32 v9, s30, 0
+; GFX900-NEXT: v_writelane_b32 v9, s31, 1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_readlane_b32 s31, v9, 1
+; GFX900-NEXT: v_readlane_b32 s30, v9, 0
+; GFX900-NEXT: s_mov_b32 s32, s33
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_mov_b32 s33, s18
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_call_v16bf16:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_mov_b32 s2, s33
+; GFX950-NEXT: s_mov_b32 s33, s32
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_store_dword off, v9, s33 ; 4-byte Folded Spill
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_add_i32 s32, s32, 16
+; GFX950-NEXT: s_getpc_b64 s[0:1]
+; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX950-NEXT: v_writelane_b32 v9, s30, 0
+; GFX950-NEXT: v_writelane_b32 v9, s31, 1
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_readlane_b32 s31, v9, 1
+; GFX950-NEXT: v_readlane_b32 s30, v9, 0
+; GFX950-NEXT: s_mov_b32 s32, s33
+; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload
+; GFX950-NEXT: s_mov_b64 exec, s[0:1]
+; GFX950-NEXT: s_mov_b32 s33, s2
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v16bf16:
; GFX10: ; %bb.0: ; %entry
@@ -5332,14 +5602,23 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_alloca_load_store_ret:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_alloca_load_store_ret:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_alloca_load_store_ret:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_store_short off, v0, s32 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: scratch_load_ushort v0, off, s32 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_alloca_load_store_ret:
; GFX10: ; %bb.0: ; %entry
@@ -5625,52 +5904,72 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_overflow_stack:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(25)
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: s_waitcnt vmcnt(25)
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: s_waitcnt vmcnt(25)
-; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: test_overflow_stack:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
+; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(25)
+; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124
+; GFX900-NEXT: s_waitcnt vmcnt(25)
+; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
+; GFX900-NEXT: s_waitcnt vmcnt(25)
+; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116
+; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: test_overflow_stack:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112
+; GFX950-NEXT: scratch_store_short v0, v1, off offset:128
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_overflow_stack:
; GFX10: ; %bb.0:
@@ -5870,15 +6169,25 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
; GFX10: ; %bb.0:
@@ -6120,18 +6429,31 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
; GFX10: ; %bb.0:
@@ -6766,16 +7088,27 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dword v2, v[0:1], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dword v0, v[0:1], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
; GFX10: ; %bb.0:
@@ -6852,18 +7185,31 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
; GFX10: ; %bb.0:
@@ -8476,193 +8822,363 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:62
-; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:60
-; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:58
-; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:56
-; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:54
-; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:52
-; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:50
-; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:48
-; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:46
-; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:44
-; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:42
-; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:40
-; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:38
-; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:36
-; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:34
-; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:32
-; GFX9-NEXT: global_load_ushort v26, v[1:2], off
-; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:2
-; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
-; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
-; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
-; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
-; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24
-; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:30
-; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26
-; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28
-; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4
-; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6
-; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8
-; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10
-; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
-; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
-; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
-; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v12
-; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
-; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v14
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v15
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
-; GFX9-NEXT: s_waitcnt vmcnt(32)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21
-; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v23
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v21
-; GFX9-NEXT: s_waitcnt vmcnt(33)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v19
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v20
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144
-; GFX9-NEXT: s_waitcnt vmcnt(44)
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v25
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v28
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v27
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v2
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v2
-; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v2
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v2
-; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v10
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_ushort v9, v[1:2], off offset:62
+; GFX900-NEXT: global_load_ushort v11, v[1:2], off offset:60
+; GFX900-NEXT: global_load_ushort v12, v[1:2], off offset:58
+; GFX900-NEXT: global_load_ushort v13, v[1:2], off offset:56
+; GFX900-NEXT: global_load_ushort v14, v[1:2], off offset:54
+; GFX900-NEXT: global_load_ushort v15, v[1:2], off offset:52
+; GFX900-NEXT: global_load_ushort v16, v[1:2], off offset:50
+; GFX900-NEXT: global_load_ushort v17, v[1:2], off offset:48
+; GFX900-NEXT: global_load_ushort v18, v[1:2], off offset:46
+; GFX900-NEXT: global_load_ushort v19, v[1:2], off offset:44
+; GFX900-NEXT: global_load_ushort v20, v[1:2], off offset:42
+; GFX900-NEXT: global_load_ushort v21, v[1:2], off offset:40
+; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:38
+; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:36
+; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:34
+; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32
+; GFX900-NEXT: global_load_ushort v26, v[1:2], off
+; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:2
+; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16
+; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18
+; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20
+; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22
+; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24
+; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:30
+; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26
+; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28
+; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:4
+; GFX900-NEXT: global_load_ushort v32, v[1:2], off offset:6
+; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:8
+; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:10
+; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:12
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_ushort v1, v[1:2], off offset:14
+; GFX900-NEXT: s_waitcnt vmcnt(31)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v9
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
+; GFX900-NEXT: s_waitcnt vmcnt(30)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; GFX900-NEXT: s_waitcnt vmcnt(28)
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v13
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v12
+; GFX900-NEXT: s_waitcnt vmcnt(29)
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
+; GFX900-NEXT: s_waitcnt vmcnt(30)
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v12
+; GFX900-NEXT: s_waitcnt vmcnt(31)
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GFX900-NEXT: s_waitcnt vmcnt(30)
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v17
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
+; GFX900-NEXT: s_waitcnt vmcnt(31)
+; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v14
+; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
+; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v15
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
+; GFX900-NEXT: s_waitcnt vmcnt(32)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19
+; GFX900-NEXT: s_waitcnt vmcnt(30)
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v21
+; GFX900-NEXT: s_waitcnt vmcnt(28)
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
+; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200
+; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196
+; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v21
+; GFX900-NEXT: s_waitcnt vmcnt(33)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v19
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v20
+; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188
+; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176
+; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172
+; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168
+; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164
+; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160
+; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156
+; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152
+; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
+; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144
+; GFX900-NEXT: s_waitcnt vmcnt(44)
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11
+; GFX900-NEXT: s_waitcnt vmcnt(38)
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
+; GFX900-NEXT: s_waitcnt vmcnt(38)
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v27
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2
+; GFX900-NEXT: s_waitcnt vmcnt(41)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2
+; GFX900-NEXT: s_waitcnt vmcnt(40)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2
+; GFX900-NEXT: s_waitcnt vmcnt(41)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2
+; GFX900-NEXT: s_waitcnt vmcnt(40)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
+; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
+; GFX900-NEXT: s_waitcnt vmcnt(41)
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7
+; GFX900-NEXT: s_waitcnt vmcnt(40)
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
+; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
+; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
+; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
+; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v10
+; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
+; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
+; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
+; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
+; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
+; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44
+; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40
+; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36
+; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32
+; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
+; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
+; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
+; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
+; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
+; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
+; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4
+; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2
+; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12
+; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8
+; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4
+; GFX950-NEXT: global_load_ushort v7, v[2:3], off
+; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6
+; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10
+; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14
+; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18
+; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28
+; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24
+; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20
+; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16
+; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22
+; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26
+; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30
+; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34
+; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44
+; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40
+; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36
+; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32
+; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38
+; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42
+; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46
+; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50
+; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62
+; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60
+; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56
+; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52
+; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48
+; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54
+; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: s_waitcnt vmcnt(31)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: s_waitcnt vmcnt(30)
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX950-NEXT: s_waitcnt vmcnt(29)
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT: s_waitcnt vmcnt(27)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX950-NEXT: s_waitcnt vmcnt(26)
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10
+; GFX950-NEXT: s_waitcnt vmcnt(23)
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11
+; GFX950-NEXT: s_waitcnt vmcnt(22)
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX950-NEXT: s_waitcnt vmcnt(21)
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX950-NEXT: s_waitcnt vmcnt(20)
+; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX950-NEXT: s_waitcnt vmcnt(19)
+; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX950-NEXT: s_waitcnt vmcnt(18)
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27
+; GFX950-NEXT: s_waitcnt vmcnt(16)
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18
+; GFX950-NEXT: s_waitcnt vmcnt(15)
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19
+; GFX950-NEXT: s_waitcnt vmcnt(14)
+; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20
+; GFX950-NEXT: s_waitcnt vmcnt(13)
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31
+; GFX950-NEXT: s_waitcnt vmcnt(10)
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24
+; GFX950-NEXT: s_waitcnt vmcnt(9)
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX950-NEXT: s_waitcnt vmcnt(8)
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42
+; GFX950-NEXT: s_waitcnt vmcnt(6)
+; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42
+; GFX950-NEXT: s_waitcnt vmcnt(5)
+; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46
+; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v49
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[48:49], v52
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[52:55], off offset:160
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[48:51], off offset:144
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[34:37], off offset:128
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16
+; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off
+; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
; GFX10: ; %bb.0:
@@ -9050,20 +9566,29 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16:
; GFX10: ; %bb.0:
@@ -9178,29 +9703,41 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v2, v3, v2
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v2bf16:
; GFX10: ; %bb.0:
@@ -9363,38 +9900,54 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v3bf16:
; GFX10: ; %bb.0:
@@ -9604,46 +10157,65 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v4bf16:
; GFX10: ; %bb.0:
@@ -9967,80 +10539,113 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX900-NEXT: v_add_f32_e32 v8, v9, v8
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_add_f32_e32 v6, v9, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_add_f32_e32 v8, v9, v8
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX950-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_add_f32_e32 v6, v9, v6
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v8bf16:
; GFX10: ; %bb.0:
@@ -10656,148 +11261,209 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
-; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX900-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add_f32_e32 v7, v7, v15
+; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX900-NEXT: v_add_f32_e32 v15, v17, v15
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX900-NEXT: v_add_f32_e32 v14, v17, v14
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v5, v5, v13
+; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX900-NEXT: v_add_f32_e32 v13, v17, v13
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v4, v4, v12
+; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX900-NEXT: v_add_f32_e32 v12, v17, v12
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v11, v17, v11
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v2, v2, v10
+; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX900-NEXT: v_add_f32_e32 v10, v17, v10
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v9, v17, v9
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v8
+; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_add_f32_e32 v16, v17, v16
+; GFX950-NEXT: v_add_f32_e32 v7, v7, v15
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_add_f32_e32 v15, v17, v15
+; GFX950-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_add_f32_e32 v14, v17, v14
+; GFX950-NEXT: v_add_f32_e32 v5, v5, v13
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_add_f32_e32 v13, v17, v13
+; GFX950-NEXT: v_add_f32_e32 v4, v4, v12
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_add_f32_e32 v12, v17, v12
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v11, v17, v11
+; GFX950-NEXT: v_add_f32_e32 v2, v2, v10
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_add_f32_e32 v10, v17, v10
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v9, v17, v9
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v16bf16:
; GFX10: ; %bb.0:
@@ -12112,286 +12778,407 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
-; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_add_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_add_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_add_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_add_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: v_add_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_add_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX900-NEXT: v_add_f32_e32 v31, v32, v31
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX900-NEXT: v_add_f32_e32 v30, v32, v30
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX900-NEXT: v_add_f32_e32 v32, v32, v29
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX900-NEXT: v_add_f32_e32 v33, v33, v34
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_add_f32_e32 v29, v15, v29
+; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX900-NEXT: v_add_f32_e32 v28, v33, v28
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX900-NEXT: v_add_f32_e32 v27, v33, v27
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX900-NEXT: v_add_f32_e32 v26, v33, v26
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v9, v9, v25
+; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX900-NEXT: v_add_f32_e32 v25, v33, v25
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v8, v8, v24
+; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX900-NEXT: v_add_f32_e32 v24, v33, v24
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX900-NEXT: v_add_f32_e32 v23, v33, v23
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX900-NEXT: v_add_f32_e32 v22, v33, v22
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v5, v5, v21
+; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX900-NEXT: v_add_f32_e32 v21, v33, v21
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX900-NEXT: v_add_f32_e32 v20, v33, v20
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v19, v33, v19
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v2, v2, v18
+; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX900-NEXT: v_add_f32_e32 v18, v33, v18
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v17
+; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v17, v33, v17
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GFX950-NEXT: v_add_f32_e32 v8, v8, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_add_f32_e32 v33, v34, v33
+; GFX950-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX950-NEXT: v_add_f32_e32 v30, v36, v35
+; GFX950-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX950-NEXT: v_add_f32_e32 v29, v38, v37
+; GFX950-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX950-NEXT: v_add_f32_e32 v28, v48, v39
+; GFX950-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX950-NEXT: v_add_f32_e32 v27, v50, v49
+; GFX950-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX950-NEXT: v_add_f32_e32 v26, v52, v51
+; GFX950-NEXT: v_add_f32_e32 v9, v9, v25
+; GFX950-NEXT: v_add_f32_e32 v25, v54, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31
+; GFX950-NEXT: v_add_f32_e32 v24, v32, v24
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_add_f32_e32 v23, v32, v23
+; GFX950-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_add_f32_e32 v22, v32, v22
+; GFX950-NEXT: v_add_f32_e32 v5, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_add_f32_e32 v21, v32, v21
+; GFX950-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_add_f32_e32 v20, v32, v20
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v19, v32, v19
+; GFX950-NEXT: v_add_f32_e32 v2, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX950-NEXT: v_add_f32_e32 v18, v32, v18
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v15, v15, v31
+; GFX950-NEXT: v_add_f32_e32 v31, v40, v55
+; GFX950-NEXT: v_add_f32_e32 v17, v32, v17
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v32bf16:
; GFX10: ; %bb.0:
@@ -13290,19 +14077,27 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_bf16_fpimm_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_bf16_fpimm_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_bf16_fpimm_0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16_fpimm_0:
; GFX10: ; %bb.0:
@@ -13386,19 +14181,27 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fadd_bf16_fpimm_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fadd_bf16_fpimm_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fadd_bf16_fpimm_1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16_fpimm_1:
; GFX10: ; %bb.0:
@@ -13487,20 +14290,29 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fsub_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fsub_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fsub_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_bf16:
; GFX10: ; %bb.0:
@@ -13615,29 +14427,41 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fsub_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fsub_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fsub_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v2bf16:
; GFX10: ; %bb.0:
@@ -13800,38 +14624,54 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fsub_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fsub_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_sub_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fsub_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_sub_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v3bf16:
; GFX10: ; %bb.0:
@@ -14041,46 +14881,65 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fsub_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fsub_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_sub_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fsub_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_sub_f32_e32 v4, v5, v4
+; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_sub_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v4bf16:
; GFX10: ; %bb.0:
@@ -14249,20 +15108,29 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_bf16:
; GFX10: ; %bb.0:
@@ -14377,29 +15245,41 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v2bf16:
; GFX10: ; %bb.0:
@@ -14562,38 +15442,54 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v3bf16:
; GFX10: ; %bb.0:
@@ -14803,46 +15699,65 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v4bf16:
; GFX10: ; %bb.0:
@@ -15166,80 +16081,113 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX900-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX900-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v5, v9, v5
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX950-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX950-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v5, v9, v5
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v8bf16:
; GFX10: ; %bb.0:
@@ -15855,148 +16803,209 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX900-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX900-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX900-NEXT: v_mul_f32_e32 v14, v17, v14
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX900-NEXT: v_mul_f32_e32 v13, v17, v13
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX900-NEXT: v_mul_f32_e32 v12, v17, v12
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX900-NEXT: v_mul_f32_e32 v11, v17, v11
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v10, v17, v10
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v9, v17, v9
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX950-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX950-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_mul_f32_e32 v14, v17, v14
+; GFX950-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_mul_f32_e32 v13, v17, v13
+; GFX950-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_mul_f32_e32 v12, v17, v12
+; GFX950-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_mul_f32_e32 v11, v17, v11
+; GFX950-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v10, v17, v10
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v9, v17, v9
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v16bf16:
; GFX10: ; %bb.0:
@@ -17311,286 +18320,407 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmul_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
-; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmul_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX900-NEXT: v_mul_f32_e32 v31, v32, v31
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX900-NEXT: v_mul_f32_e32 v30, v32, v30
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX900-NEXT: v_mul_f32_e32 v32, v32, v29
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX900-NEXT: v_mul_f32_e32 v33, v33, v34
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_mul_f32_e32 v29, v15, v29
+; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX900-NEXT: v_mul_f32_e32 v28, v33, v28
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX900-NEXT: v_mul_f32_e32 v27, v33, v27
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX900-NEXT: v_mul_f32_e32 v26, v33, v26
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX900-NEXT: v_mul_f32_e32 v25, v33, v25
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX900-NEXT: v_mul_f32_e32 v24, v33, v24
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX900-NEXT: v_mul_f32_e32 v23, v33, v23
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX900-NEXT: v_mul_f32_e32 v22, v33, v22
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX900-NEXT: v_mul_f32_e32 v21, v33, v21
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX900-NEXT: v_mul_f32_e32 v20, v33, v20
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX900-NEXT: v_mul_f32_e32 v19, v33, v19
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v18, v33, v18
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v17, v33, v17
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmul_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GFX950-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX950-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX950-NEXT: v_mul_f32_e32 v30, v36, v35
+; GFX950-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX950-NEXT: v_mul_f32_e32 v29, v38, v37
+; GFX950-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX950-NEXT: v_mul_f32_e32 v28, v48, v39
+; GFX950-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX950-NEXT: v_mul_f32_e32 v27, v50, v49
+; GFX950-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX950-NEXT: v_mul_f32_e32 v26, v52, v51
+; GFX950-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX950-NEXT: v_mul_f32_e32 v25, v54, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31
+; GFX950-NEXT: v_mul_f32_e32 v24, v32, v24
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_mul_f32_e32 v23, v32, v23
+; GFX950-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_mul_f32_e32 v22, v32, v22
+; GFX950-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_mul_f32_e32 v21, v32, v21
+; GFX950-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_mul_f32_e32 v20, v32, v20
+; GFX950-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_mul_f32_e32 v19, v32, v19
+; GFX950-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX950-NEXT: v_mul_f32_e32 v18, v32, v18
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v15, v15, v31
+; GFX950-NEXT: v_mul_f32_e32 v31, v40, v55
+; GFX950-NEXT: v_mul_f32_e32 v17, v32, v17
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v32bf16:
; GFX10: ; %bb.0:
@@ -18524,30 +19654,50 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fdiv_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_rcp_f32_e32 v4, v2
-; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4
-; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4
-; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3
-; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5
-; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3
-; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fdiv_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX900-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_rcp_f32_e32 v4, v2
+; GFX900-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX900-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX900-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX900-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX900-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX900-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX900-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX900-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fdiv_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX950-NEXT: v_rcp_f32_e32 v3, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_fma_f32 v4, -v2, v3, 1.0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX950-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX950-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX950-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX950-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX950-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX950-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_bf16:
; GFX10: ; %bb.0:
@@ -18996,20 +20146,29 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_bf16:
; GFX10: ; %bb.0:
@@ -19124,29 +20283,41 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v2, v3, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v2, v3, v2
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v2bf16:
; GFX10: ; %bb.0:
@@ -19309,38 +20480,54 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v3bf16:
; GFX10: ; %bb.0:
@@ -19550,46 +20737,65 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX950-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v4bf16:
; GFX10: ; %bb.0:
@@ -19913,80 +21119,113 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX900-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX900-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v5, v9, v5
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX950-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX950-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v5, v9, v5
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v8bf16:
; GFX10: ; %bb.0:
@@ -20602,148 +21841,209 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
-; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX900-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX900-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX900-NEXT: v_min_f32_e32 v14, v17, v14
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX900-NEXT: v_min_f32_e32 v13, v17, v13
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX900-NEXT: v_min_f32_e32 v12, v17, v12
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX900-NEXT: v_min_f32_e32 v11, v17, v11
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX900-NEXT: v_min_f32_e32 v10, v17, v10
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v9, v17, v9
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX950-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_min_f32_e32 v14, v17, v14
+; GFX950-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_min_f32_e32 v13, v17, v13
+; GFX950-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_min_f32_e32 v12, v17, v12
+; GFX950-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_min_f32_e32 v11, v17, v11
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v10, v17, v10
+; GFX950-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v9, v17, v9
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v16bf16:
; GFX10: ; %bb.0:
@@ -22058,286 +23358,407 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minnum_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
-; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_min_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_min_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_min_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_min_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: v_min_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_min_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minnum_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX900-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX900-NEXT: v_min_f32_e32 v30, v32, v30
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX900-NEXT: v_min_f32_e32 v32, v32, v29
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX900-NEXT: v_min_f32_e32 v33, v33, v34
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_min_f32_e32 v29, v15, v29
+; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX900-NEXT: v_min_f32_e32 v28, v33, v28
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX900-NEXT: v_min_f32_e32 v27, v33, v27
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX900-NEXT: v_min_f32_e32 v26, v33, v26
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX900-NEXT: v_min_f32_e32 v25, v33, v25
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX900-NEXT: v_min_f32_e32 v24, v33, v24
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX900-NEXT: v_min_f32_e32 v23, v33, v23
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX900-NEXT: v_min_f32_e32 v22, v33, v22
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX900-NEXT: v_min_f32_e32 v21, v33, v21
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX900-NEXT: v_min_f32_e32 v20, v33, v20
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX900-NEXT: v_min_f32_e32 v19, v33, v19
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX900-NEXT: v_min_f32_e32 v18, v33, v18
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v17, v33, v17
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GFX950-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX950-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX950-NEXT: v_min_f32_e32 v30, v36, v35
+; GFX950-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX950-NEXT: v_min_f32_e32 v29, v38, v37
+; GFX950-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX950-NEXT: v_min_f32_e32 v28, v48, v39
+; GFX950-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX950-NEXT: v_min_f32_e32 v27, v50, v49
+; GFX950-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX950-NEXT: v_min_f32_e32 v26, v52, v51
+; GFX950-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX950-NEXT: v_min_f32_e32 v25, v54, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31
+; GFX950-NEXT: v_min_f32_e32 v24, v32, v24
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_min_f32_e32 v23, v32, v23
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_min_f32_e32 v22, v32, v22
+; GFX950-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_min_f32_e32 v21, v32, v21
+; GFX950-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_min_f32_e32 v20, v32, v20
+; GFX950-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_min_f32_e32 v19, v32, v19
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX950-NEXT: v_min_f32_e32 v18, v32, v18
+; GFX950-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v15, v15, v31
+; GFX950-NEXT: v_min_f32_e32 v31, v40, v55
+; GFX950-NEXT: v_min_f32_e32 v17, v32, v17
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v32bf16:
; GFX10: ; %bb.0:
@@ -23250,20 +24671,29 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_bf16:
; GFX10: ; %bb.0:
@@ -23378,29 +24808,41 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v2bf16:
; GFX10: ; %bb.0:
@@ -23563,38 +25005,54 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v3bf16:
; GFX10: ; %bb.0:
@@ -23804,46 +25262,65 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v4bf16:
; GFX10: ; %bb.0:
@@ -24167,80 +25644,113 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v5, v9, v5
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v5, v9, v5
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v8bf16:
; GFX10: ; %bb.0:
@@ -24856,148 +26366,209 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v14, v17, v14
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v13, v17, v13
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v12, v17, v12
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v11, v17, v11
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v10, v17, v10
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v9, v17, v9
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v14, v17, v14
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v13, v17, v13
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v12, v17, v12
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v11, v17, v11
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v10, v17, v10
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v9, v17, v9
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v16bf16:
; GFX10: ; %bb.0:
@@ -26312,286 +27883,407 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maxnum_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
-; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX900-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX900-NEXT: v_max_f32_e32 v30, v32, v30
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v32, v32, v29
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX900-NEXT: v_max_f32_e32 v33, v33, v34
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_max_f32_e32 v29, v15, v29
+; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX900-NEXT: v_max_f32_e32 v28, v33, v28
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v27, v33, v27
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v26, v33, v26
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX900-NEXT: v_max_f32_e32 v25, v33, v25
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v24, v33, v24
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v23, v33, v23
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v22, v33, v22
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v21, v33, v21
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v20, v33, v20
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v19, v33, v19
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v18, v33, v18
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v17, v33, v17
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GFX950-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX950-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX950-NEXT: v_max_f32_e32 v30, v36, v35
+; GFX950-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX950-NEXT: v_max_f32_e32 v29, v38, v37
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX950-NEXT: v_max_f32_e32 v28, v48, v39
+; GFX950-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX950-NEXT: v_max_f32_e32 v27, v50, v49
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX950-NEXT: v_max_f32_e32 v26, v52, v51
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX950-NEXT: v_max_f32_e32 v25, v54, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31
+; GFX950-NEXT: v_max_f32_e32 v24, v32, v24
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v23, v32, v23
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v22, v32, v22
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v21, v32, v21
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v20, v32, v20
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v19, v32, v19
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX950-NEXT: v_max_f32_e32 v18, v32, v18
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v15, v15, v31
+; GFX950-NEXT: v_max_f32_e32 v31, v40, v55
+; GFX950-NEXT: v_max_f32_e32 v17, v32, v17
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v32bf16:
; GFX10: ; %bb.0:
@@ -27543,36 +29235,66 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sqrt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xf800000
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX9-NEXT: v_add_u32_e32 v2, -1, v1
-; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sqrt_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0xf800000
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_sqrt_f32_e32 v1, v0
+; GFX900-NEXT: v_add_u32_e32 v2, -1, v1
+; GFX900-NEXT: v_fma_f32 v3, -v2, v1, v0
+; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX900-NEXT: v_fma_f32 v1, -v3, v1, v0
+; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX900-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x260
+; GFX900-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sqrt_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0xf800000
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_sqrt_f32_e32 v1, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_add_u32_e32 v2, -1, v1
+; GFX950-NEXT: v_fma_f32 v3, -v2, v1, v0
+; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3
+; GFX950-NEXT: v_add_u32_e32 v3, 1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
+; GFX950-NEXT: v_fma_f32 v1, -v3, v1, v0
+; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
+; GFX950-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x260
+; GFX950-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sqrt_bf16:
; GFX10: ; %bb.0:
@@ -27715,19 +29437,27 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_ldexp_bf16_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_ldexp_bf16_i32:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_ldexp_bf16_i32:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ldexp_bf16_i32:
; GFX10: ; %bb.0:
@@ -27820,20 +29550,29 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_frexp_bf16_i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_frexp_bf16_i16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_frexp_bf16_i16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_frexp_bf16_i16:
; GFX10: ; %bb.0:
@@ -27979,35 +29718,61 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_log_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x800000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_log_f32_e32 v0, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x3f317217
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x3377d1cf
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2
+; GFX900-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_log_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x800000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x3f317217
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_log_f32_e32 v0, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x7f800000
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log_bf16:
; GFX10: ; %bb.0:
@@ -28153,26 +29918,42 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_log2_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x800000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_log_f32_e32 v0, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_log2_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x800000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_log_f32_e32 v0, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log2_bf16:
; GFX10: ; %bb.0:
@@ -28329,35 +30110,61 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_log10_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log10_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x800000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_log_f32_e32 v0, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x3284fbcf
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2
+; GFX900-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_log10_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x800000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x3e9a209a
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_log_f32_e32 v0, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x7f800000
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log10_bf16:
; GFX10: ; %bb.0:
@@ -28541,36 +30348,61 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_exp_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b
-; GFX9-NEXT: v_rndne_f32_e32 v2, v1
-; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0
-; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x42b17218
-; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x3fb8aa3b
+; GFX900-NEXT: v_rndne_f32_e32 v2, v1
+; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x32a5705f
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1
+; GFX900-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX900-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s4, 0xc2ce8ed0
+; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x42b17218
+; GFX900-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_exp_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x3fb8aa3b
+; GFX950-NEXT: v_rndne_f32_e32 v2, v1
+; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x32a5705f, v1
+; GFX950-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX950-NEXT: v_exp_f32_e32 v1, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s0, 0xc2ce8ed0
+; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x42b17218
+; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp_bf16:
; GFX10: ; %bb.0:
@@ -28722,27 +30554,43 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_exp2_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_not_b32_e32 v1, 63
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp2_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0xc2fc0000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-NEXT: v_not_b32_e32 v1, 63
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_exp2_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0xc2fc0000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX950-NEXT: v_not_b32_e32 v1, 63
+; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_exp_f32_e32 v0, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp2_bf16:
; GFX10: ; %bb.0:
@@ -28900,36 +30748,61 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_exp10_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x40549a78
-; GFX9-NEXT: v_rndne_f32_e32 v2, v1
-; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x33979a37
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4
-; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x421a209b
-; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp10_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x40549a78
+; GFX900-NEXT: v_rndne_f32_e32 v2, v1
+; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x33979a37
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1
+; GFX900-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX900-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s4, 0xc23369f4
+; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x421a209b
+; GFX900-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_exp10_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x40549a78
+; GFX950-NEXT: v_rndne_f32_e32 v2, v1
+; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x33979a37, v1
+; GFX950-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX950-NEXT: v_exp_f32_e32 v1, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s0, 0xc23369f4
+; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x421a209b
+; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp10_bf16:
; GFX10: ; %bb.0:
@@ -29059,19 +30932,27 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_ceil_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_ceil_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_ceil_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_ceil_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_ceil_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_ceil_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ceil_bf16:
; GFX10: ; %bb.0:
@@ -29157,19 +31038,27 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_trunc_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_trunc_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_trunc_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_trunc_bf16:
; GFX10: ; %bb.0:
@@ -29255,19 +31144,27 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_rint_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_rint_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_rint_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_rndne_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rint_bf16:
; GFX10: ; %bb.0:
@@ -29353,19 +31250,27 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_nearbyint_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_nearbyint_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_nearbyint_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_rndne_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_nearbyint_bf16:
; GFX10: ; %bb.0:
@@ -29469,25 +31374,40 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_round_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v1, v0
-; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_round_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v1, v0
+; GFX900-NEXT: v_sub_f32_e32 v2, v0, v1
+; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX900-NEXT: s_brev_b32 s4, -2
+; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0
+; GFX900-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_round_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v1, v0
+; GFX950-NEXT: v_sub_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX950-NEXT: s_brev_b32 s0, -2
+; GFX950-NEXT: v_bfi_b32 v0, s0, v2, v0
+; GFX950-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_round_bf16:
; GFX10: ; %bb.0:
@@ -29592,19 +31512,27 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_roundeven_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_roundeven_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_roundeven_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_rndne_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_bf16:
; GFX10: ; %bb.0:
@@ -29690,19 +31618,27 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_floor_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_floor_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_floor_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_floor_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_floor_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_floor_bf16:
; GFX10: ; %bb.0:
@@ -29786,19 +31722,27 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_canonicalize_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_canonicalize_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_canonicalize_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_canonicalize_bf16:
; GFX10: ; %bb.0:
@@ -29929,14 +31873,24 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_oeq_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_oeq_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_oeq_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oeq_bf16:
; GFX10: ; %bb.0:
@@ -30004,14 +31958,24 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ogt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_ogt_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_ogt_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ogt_bf16:
; GFX10: ; %bb.0:
@@ -30079,14 +32043,24 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_oge_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_oge_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_oge_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oge_bf16:
; GFX10: ; %bb.0:
@@ -30154,14 +32128,24 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_olt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_olt_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_olt_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_olt_bf16:
; GFX10: ; %bb.0:
@@ -30229,14 +32213,24 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ole_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_ole_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_ole_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ole_bf16:
; GFX10: ; %bb.0:
@@ -30304,14 +32298,24 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_one_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_one_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_one_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_one_bf16:
; GFX10: ; %bb.0:
@@ -30379,14 +32383,24 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_uno_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_uno_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_uno_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uno_bf16:
; GFX10: ; %bb.0:
@@ -30454,14 +32468,24 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ueq_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_ueq_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_ueq_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ueq_bf16:
; GFX10: ; %bb.0:
@@ -30529,14 +32553,24 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ugt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_ugt_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_ugt_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ugt_bf16:
; GFX10: ; %bb.0:
@@ -30604,14 +32638,24 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_uge_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_uge_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_uge_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uge_bf16:
; GFX10: ; %bb.0:
@@ -30679,14 +32723,24 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ult_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_ult_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_ult_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ult_bf16:
; GFX10: ; %bb.0:
@@ -30754,14 +32808,24 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ule_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_ule_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_ule_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ule_bf16:
; GFX10: ; %bb.0:
@@ -30829,14 +32893,24 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_une_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_une_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_une_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_une_bf16:
; GFX10: ; %bb.0:
@@ -31011,16 +33085,27 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX10: ; %bb.0:
@@ -31110,18 +33195,31 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v0, v0, v2, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX10: ; %bb.0:
@@ -31232,21 +33330,37 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX900-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX950-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT: v_perm_b32 v1, v1, v2, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX10: ; %bb.0:
@@ -31663,24 +33777,44 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_bf16_to_i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
-; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4
-; GFX9-NEXT: v_floor_f32_e32 v1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0xcf800000
-; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0|
-; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_bf16_to_i64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v0, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x2f800000
+; GFX900-NEXT: v_mul_f32_e64 v1, |v0|, s4
+; GFX900-NEXT: v_floor_f32_e32 v1, v1
+; GFX900-NEXT: s_mov_b32 s4, 0xcf800000
+; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GFX900-NEXT: v_fma_f32 v1, v1, s4, |v0|
+; GFX900-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX900-NEXT: v_xor_b32_e32 v2, v2, v3
+; GFX900-NEXT: v_xor_b32_e32 v0, v1, v3
+; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_bf16_to_i64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x2f800000
+; GFX950-NEXT: v_mul_f32_e64 v1, |v0|, s0
+; GFX950-NEXT: v_floor_f32_e32 v1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0xcf800000
+; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GFX950-NEXT: v_fma_f32 v1, v1, s0, |v0|
+; GFX950-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX950-NEXT: v_xor_b32_e32 v2, v2, v3
+; GFX950-NEXT: v_xor_b32_e32 v0, v1, v3
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_bf16_to_i64:
; GFX10: ; %bb.0:
@@ -31845,36 +33979,69 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
-; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_floor_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
-; GFX9-NEXT: v_trunc_f32_e32 v4, v0
-; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1|
-; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4|
-; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
-; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1
-; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3
-; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v1, v1
+; GFX900-NEXT: s_mov_b32 s4, 0x2f800000
+; GFX900-NEXT: v_mul_f32_e64 v2, |v1|, s4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_floor_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s5, 0xcf800000
+; GFX900-NEXT: v_trunc_f32_e32 v4, v0
+; GFX900-NEXT: v_fma_f32 v3, v2, s5, |v1|
+; GFX900-NEXT: v_mul_f32_e64 v0, |v4|, s4
+; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX900-NEXT: v_floor_f32_e32 v0, v0
+; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX900-NEXT: v_fma_f32 v5, v0, s5, |v4|
+; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX900-NEXT: v_xor_b32_e32 v3, v3, v1
+; GFX900-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1
+; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v4
+; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX900-NEXT: v_xor_b32_e32 v2, v5, v3
+; GFX900-NEXT: v_xor_b32_e32 v4, v6, v3
+; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x2f800000
+; GFX950-NEXT: v_mul_f32_e64 v2, |v1|, s0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT: v_floor_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s1, 0xcf800000
+; GFX950-NEXT: v_trunc_f32_e32 v4, v0
+; GFX950-NEXT: v_fma_f32 v3, v2, s1, |v1|
+; GFX950-NEXT: v_mul_f32_e64 v0, |v4|, s0
+; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX950-NEXT: v_floor_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX950-NEXT: v_fma_f32 v5, v0, s1, |v4|
+; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v3, v1
+; GFX950-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1
+; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX950-NEXT: v_xor_b32_e32 v2, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v4, v6, v3
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX10: ; %bb.0:
@@ -32082,49 +34249,96 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
-; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
-; GFX9-NEXT: v_floor_f32_e32 v3, v3
-; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
-; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT: v_trunc_f32_e32 v5, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
-; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT: v_trunc_f32_e32 v1, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4
-; GFX9-NEXT: v_floor_f32_e32 v5, v5
-; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3
-; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1|
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s4, 0x2f800000
+; GFX900-NEXT: v_mul_f32_e64 v3, |v2|, s4
+; GFX900-NEXT: v_floor_f32_e32 v3, v3
+; GFX900-NEXT: s_mov_b32 s5, 0xcf800000
+; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2|
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX900-NEXT: v_trunc_f32_e32 v5, v0
+; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4
+; GFX900-NEXT: v_floor_f32_e32 v0, v0
+; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5|
+; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2
+; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2
+; GFX900-NEXT: v_cvt_u32_f32_e32 v8, v0
+; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX900-NEXT: v_trunc_f32_e32 v1, v1
+; GFX900-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
+; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5
+; GFX900-NEXT: v_mul_f32_e64 v5, |v1|, s4
+; GFX900-NEXT: v_floor_f32_e32 v5, v5
+; GFX900-NEXT: v_xor_b32_e32 v2, v7, v3
+; GFX900-NEXT: v_fma_f32 v7, v5, s5, |v1|
+; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX900-NEXT: v_xor_b32_e32 v4, v8, v3
+; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX900-NEXT: v_xor_b32_e32 v4, v7, v1
+; GFX900-NEXT: v_xor_b32_e32 v5, v5, v1
+; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
+; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, v6
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x2f800000
+; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0
+; GFX950-NEXT: v_floor_f32_e32 v3, v3
+; GFX950-NEXT: s_mov_b32 s1, 0xcf800000
+; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2|
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX950-NEXT: v_trunc_f32_e32 v5, v0
+; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0
+; GFX950-NEXT: v_floor_f32_e32 v0, v0
+; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5|
+; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2
+; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2
+; GFX950-NEXT: v_cvt_u32_f32_e32 v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX950-NEXT: v_trunc_f32_e32 v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
+; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5
+; GFX950-NEXT: v_mul_f32_e64 v5, |v1|, s0
+; GFX950-NEXT: v_floor_f32_e32 v5, v5
+; GFX950-NEXT: v_xor_b32_e32 v2, v7, v3
+; GFX950-NEXT: v_fma_f32 v7, v5, s1, |v1|
+; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX950-NEXT: v_xor_b32_e32 v4, v8, v3
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX950-NEXT: v_xor_b32_e32 v4, v7, v1
+; GFX950-NEXT: v_xor_b32_e32 v5, v5, v1
+; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v1, v6
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX10: ; %bb.0:
@@ -32393,61 +34607,120 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
-; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
-; GFX9-NEXT: v_floor_f32_e32 v3, v3
-; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
-; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT: v_trunc_f32_e32 v5, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
-; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_trunc_f32_e32 v5, v5
-; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3
-; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4
-; GFX9-NEXT: v_floor_f32_e32 v6, v6
-; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3
-; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5|
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT: v_trunc_f32_e32 v1, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5
-; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4
-; GFX9-NEXT: v_floor_f32_e32 v7, v7
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1|
-; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1
-; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1
-; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s4, 0x2f800000
+; GFX900-NEXT: v_mul_f32_e64 v3, |v2|, s4
+; GFX900-NEXT: v_floor_f32_e32 v3, v3
+; GFX900-NEXT: s_mov_b32 s5, 0xcf800000
+; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2|
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX900-NEXT: v_trunc_f32_e32 v5, v0
+; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4
+; GFX900-NEXT: v_floor_f32_e32 v0, v0
+; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5|
+; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2
+; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2
+; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v0
+; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX900-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
+; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_trunc_f32_e32 v5, v5
+; GFX900-NEXT: v_xor_b32_e32 v2, v6, v3
+; GFX900-NEXT: v_mul_f32_e64 v6, |v5|, s4
+; GFX900-NEXT: v_floor_f32_e32 v6, v6
+; GFX900-NEXT: v_xor_b32_e32 v4, v7, v3
+; GFX900-NEXT: v_fma_f32 v7, v6, s5, |v5|
+; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX900-NEXT: v_trunc_f32_e32 v1, v1
+; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX900-NEXT: v_xor_b32_e32 v4, v7, v5
+; GFX900-NEXT: v_mul_f32_e64 v7, |v1|, s4
+; GFX900-NEXT: v_floor_f32_e32 v7, v7
+; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX900-NEXT: v_fma_f32 v9, v7, s5, |v1|
+; GFX900-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX900-NEXT: v_xor_b32_e32 v6, v6, v5
+; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
+; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX900-NEXT: v_xor_b32_e32 v6, v9, v1
+; GFX900-NEXT: v_xor_b32_e32 v7, v7, v1
+; GFX900-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
+; GFX900-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, v8
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x2f800000
+; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0
+; GFX950-NEXT: v_floor_f32_e32 v3, v3
+; GFX950-NEXT: s_mov_b32 s1, 0xcf800000
+; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2|
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GFX950-NEXT: v_trunc_f32_e32 v5, v0
+; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0
+; GFX950-NEXT: v_floor_f32_e32 v0, v0
+; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5|
+; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2
+; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2
+; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
+; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_trunc_f32_e32 v5, v5
+; GFX950-NEXT: v_xor_b32_e32 v2, v6, v3
+; GFX950-NEXT: v_mul_f32_e64 v6, |v5|, s0
+; GFX950-NEXT: v_floor_f32_e32 v6, v6
+; GFX950-NEXT: v_xor_b32_e32 v4, v7, v3
+; GFX950-NEXT: v_fma_f32 v7, v6, s1, |v5|
+; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX950-NEXT: v_trunc_f32_e32 v1, v1
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX950-NEXT: v_xor_b32_e32 v4, v7, v5
+; GFX950-NEXT: v_mul_f32_e64 v7, |v1|, s0
+; GFX950-NEXT: v_floor_f32_e32 v7, v7
+; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX950-NEXT: v_fma_f32 v9, v7, s1, |v1|
+; GFX950-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX950-NEXT: v_xor_b32_e32 v6, v6, v5
+; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
+; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX950-NEXT: v_xor_b32_e32 v6, v9, v1
+; GFX950-NEXT: v_xor_b32_e32 v7, v7, v1
+; GFX950-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v1, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX10: ; %bb.0:
@@ -32594,18 +34867,25 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_i16_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_i16_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_i16_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i16_to_bf16:
; GFX10: ; %bb.0:
@@ -32698,25 +34978,33 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX10: ; %bb.0:
@@ -32846,32 +35134,42 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX10: ; %bb.0:
@@ -33042,38 +35340,49 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX10: ; %bb.0:
@@ -33219,18 +35528,25 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_i32_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_i32_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_i32_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i32_to_bf16:
; GFX10: ; %bb.0:
@@ -33315,25 +35631,33 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX10: ; %bb.0:
@@ -33452,32 +35776,42 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v1
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX10: ; %bb.0:
@@ -33629,38 +35963,49 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX900-NEXT: v_cvt_f32_i32_e32 v3, v3
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v3
+; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX10: ; %bb.0:
@@ -33827,29 +36172,47 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_i64_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT: v_ffbh_i32_e32 v3, v1
-; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
-; GFX9-NEXT: v_add_u32_e32 v3, -1, v3
-; GFX9-NEXT: v_min_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_i64_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_xor_b32_e32 v2, v0, v1
+; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX900-NEXT: v_ffbh_i32_e32 v3, v1
+; GFX900-NEXT: v_add_u32_e32 v2, 32, v2
+; GFX900-NEXT: v_add_u32_e32 v3, -1, v3
+; GFX900-NEXT: v_min_u32_e32 v2, v3, v2
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_i64_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v1
+; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX950-NEXT: v_ffbh_i32_e32 v3, v1
+; GFX950-NEXT: v_add_u32_e32 v2, 32, v2
+; GFX950-NEXT: v_add_u32_e32 v3, -1, v3
+; GFX950-NEXT: v_min_u32_e32 v2, v3, v2
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i64_to_bf16:
; GFX10: ; %bb.0:
@@ -34044,47 +36407,77 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1
-; GFX9-NEXT: v_ffbh_i32_e32 v4, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
-; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT: v_min_u32_e32 v4, v4, v5
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
-; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
-; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
-; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
-; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v6, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_xor_b32_e32 v5, v0, v1
+; GFX900-NEXT: v_ffbh_i32_e32 v4, v1
+; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX900-NEXT: v_add_u32_e32 v4, -1, v4
+; GFX900-NEXT: v_add_u32_e32 v5, 32, v5
+; GFX900-NEXT: v_min_u32_e32 v4, v4, v5
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4
+; GFX900-NEXT: v_ldexp_f32 v4, v0, v1
+; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3
+; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4
+; GFX900-NEXT: v_ffbh_i32_e32 v0, v3
+; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX900-NEXT: v_add_u32_e32 v0, -1, v0
+; GFX900-NEXT: v_add_u32_e32 v1, 32, v1
+; GFX900-NEXT: v_min_u32_e32 v6, v0, v1
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v5, v2, v3
+; GFX950-NEXT: v_ffbh_i32_e32 v4, v3
+; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX950-NEXT: v_add_u32_e32 v4, -1, v4
+; GFX950-NEXT: v_add_u32_e32 v5, 32, v5
+; GFX950-NEXT: v_min_u32_e32 v4, v4, v5
+; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX950-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX950-NEXT: v_xor_b32_e32 v5, v0, v1
+; GFX950-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX950-NEXT: v_ffbh_i32_e32 v3, v1
+; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX950-NEXT: v_add_u32_e32 v3, -1, v3
+; GFX950-NEXT: v_add_u32_e32 v5, 32, v5
+; GFX950-NEXT: v_min_u32_e32 v3, v3, v5
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v4
+; GFX950-NEXT: v_ldexp_f32 v1, v2, v1
+; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX10: ; %bb.0:
@@ -34386,65 +36779,109 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5
-; GFX9-NEXT: v_ffbh_i32_e32 v6, v5
-; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
-; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
-; GFX9-NEXT: v_ffbh_i32_e32 v6, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
-; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
-; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
-; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
-; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_xor_b32_e32 v7, v4, v5
+; GFX900-NEXT: v_ffbh_i32_e32 v6, v5
+; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX900-NEXT: v_add_u32_e32 v6, -1, v6
+; GFX900-NEXT: v_add_u32_e32 v7, 32, v7
+; GFX900-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX900-NEXT: v_xor_b32_e32 v7, v0, v1
+; GFX900-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX900-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6
+; GFX900-NEXT: v_ffbh_i32_e32 v6, v1
+; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX900-NEXT: v_add_u32_e32 v6, -1, v6
+; GFX900-NEXT: v_add_u32_e32 v7, 32, v7
+; GFX900-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
+; GFX900-NEXT: v_ldexp_f32 v5, v0, v1
+; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3
+; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4
+; GFX900-NEXT: v_ffbh_i32_e32 v0, v3
+; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX900-NEXT: v_add_u32_e32 v0, -1, v0
+; GFX900-NEXT: v_add_u32_e32 v1, 32, v1
+; GFX900-NEXT: v_min_u32_e32 v7, v0, v1
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v7, v4, v5
+; GFX950-NEXT: v_ffbh_i32_e32 v6, v5
+; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX950-NEXT: v_add_u32_e32 v6, -1, v6
+; GFX950-NEXT: v_add_u32_e32 v7, 32, v7
+; GFX950-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX950-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX950-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6
+; GFX950-NEXT: v_xor_b32_e32 v6, v2, v3
+; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX950-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX950-NEXT: v_ffbh_i32_e32 v5, v3
+; GFX950-NEXT: v_add_u32_e32 v5, -1, v5
+; GFX950-NEXT: v_add_u32_e32 v6, 32, v6
+; GFX950-NEXT: v_min_u32_e32 v5, v5, v6
+; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX950-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX950-NEXT: v_xor_b32_e32 v6, v0, v1
+; GFX950-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX950-NEXT: v_ffbh_i32_e32 v3, v1
+; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX950-NEXT: v_add_u32_e32 v3, -1, v3
+; GFX950-NEXT: v_add_u32_e32 v6, 32, v6
+; GFX950-NEXT: v_min_u32_e32 v3, v3, v6
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5
+; GFX950-NEXT: v_ldexp_f32 v1, v2, v1
+; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: v_mov_b32_e32 v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX10: ; %bb.0:
@@ -34842,82 +37279,137 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5
-; GFX9-NEXT: v_ffbh_i32_e32 v8, v5
-; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9
-; GFX9-NEXT: v_add_u32_e32 v8, -1, v8
-; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
-; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
-; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7
-; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
-; GFX9-NEXT: v_ffbh_i32_e32 v4, v7
-; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
-; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT: v_min_u32_e32 v10, v4, v5
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
-; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
-; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
-; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
-; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
-; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
-; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
-; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
-; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_xor_b32_e32 v9, v4, v5
+; GFX900-NEXT: v_ffbh_i32_e32 v8, v5
+; GFX900-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX900-NEXT: v_add_u32_e32 v8, -1, v8
+; GFX900-NEXT: v_add_u32_e32 v9, 32, v9
+; GFX900-NEXT: v_min_u32_e32 v8, v8, v9
+; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX900-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8
+; GFX900-NEXT: v_ldexp_f32 v8, v4, v5
+; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX900-NEXT: v_xor_b32_e32 v5, v6, v7
+; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4
+; GFX900-NEXT: v_ffbh_i32_e32 v4, v7
+; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX900-NEXT: v_add_u32_e32 v4, -1, v4
+; GFX900-NEXT: v_add_u32_e32 v5, 32, v5
+; GFX900-NEXT: v_min_u32_e32 v10, v4, v5
+; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_xor_b32_e32 v8, v0, v1
+; GFX900-NEXT: v_ffbh_i32_e32 v7, v1
+; GFX900-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GFX900-NEXT: v_add_u32_e32 v7, -1, v7
+; GFX900-NEXT: v_add_u32_e32 v8, 32, v8
+; GFX900-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX900-NEXT: v_min_u32_e32 v7, v7, v8
+; GFX900-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_ldexp_f32 v4, v4, v6
+; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
+; GFX900-NEXT: v_ldexp_f32 v6, v0, v1
+; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3
+; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4
+; GFX900-NEXT: v_ffbh_i32_e32 v0, v3
+; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1
+; GFX900-NEXT: v_add_u32_e32 v0, -1, v0
+; GFX900-NEXT: v_add_u32_e32 v1, 32, v1
+; GFX900-NEXT: v_min_u32_e32 v8, v0, v1
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v9, v6, v7
+; GFX950-NEXT: v_ffbh_i32_e32 v8, v7
+; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX950-NEXT: v_add_u32_e32 v8, -1, v8
+; GFX950-NEXT: v_add_u32_e32 v9, 32, v9
+; GFX950-NEXT: v_min_u32_e32 v8, v8, v9
+; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7]
+; GFX950-NEXT: v_min_u32_e32 v6, 1, v6
+; GFX950-NEXT: v_xor_b32_e32 v9, v4, v5
+; GFX950-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX950-NEXT: v_ffbh_i32_e32 v7, v5
+; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX950-NEXT: v_add_u32_e32 v7, -1, v7
+; GFX950-NEXT: v_add_u32_e32 v9, 32, v9
+; GFX950-NEXT: v_min_u32_e32 v7, v7, v9
+; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX950-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX950-NEXT: v_cvt_f32_i32_e32 v6, v6
+; GFX950-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8
+; GFX950-NEXT: v_ldexp_f32 v5, v6, v5
+; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7
+; GFX950-NEXT: v_xor_b32_e32 v7, v2, v3
+; GFX950-NEXT: v_ldexp_f32 v4, v4, v6
+; GFX950-NEXT: v_ffbh_i32_e32 v6, v3
+; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX950-NEXT: v_add_u32_e32 v6, -1, v6
+; GFX950-NEXT: v_add_u32_e32 v7, 32, v7
+; GFX950-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX950-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX950-NEXT: v_xor_b32_e32 v7, v0, v1
+; GFX950-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX950-NEXT: v_ffbh_i32_e32 v3, v1
+; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX950-NEXT: v_add_u32_e32 v3, -1, v3
+; GFX950-NEXT: v_add_u32_e32 v7, 32, v7
+; GFX950-NEXT: v_min_u32_e32 v3, v3, v7
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6
+; GFX950-NEXT: v_ldexp_f32 v1, v2, v1
+; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX10: ; %bb.0:
@@ -35202,18 +37694,25 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_i16_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_i16_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_i16_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i16_to_bf16:
; GFX10: ; %bb.0:
@@ -35306,25 +37805,33 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX10: ; %bb.0:
@@ -35457,32 +37964,42 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX10: ; %bb.0:
@@ -35656,38 +38173,49 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX10: ; %bb.0:
@@ -35838,18 +38366,25 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_i32_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_i32_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_i32_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_bf16:
; GFX10: ; %bb.0:
@@ -35934,25 +38469,33 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX10: ; %bb.0:
@@ -36071,32 +38614,42 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v1
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX10: ; %bb.0:
@@ -36248,38 +38801,49 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX900-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX10: ; %bb.0:
@@ -36434,25 +38998,39 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_i64_to_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX9-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_i64_to_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX900-NEXT: v_min_u32_e32 v2, 32, v2
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_i64_to_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX950-NEXT: v_min_u32_e32 v2, 32, v2
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i64_to_bf16:
; GFX10: ; %bb.0:
@@ -36606,39 +39184,61 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ffbh_u32_e32 v4, v1
-; GFX9-NEXT: v_min_u32_e32 v4, 32, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
-; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
-; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
-; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v6, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_ffbh_u32_e32 v4, v1
+; GFX900-NEXT: v_min_u32_e32 v4, 32, v4
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4
+; GFX900-NEXT: v_ldexp_f32 v4, v0, v1
+; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4
+; GFX900-NEXT: v_ffbh_u32_e32 v0, v3
+; GFX900-NEXT: v_min_u32_e32 v6, 32, v0
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_ffbh_u32_e32 v4, v3
+; GFX950-NEXT: v_min_u32_e32 v4, 32, v4
+; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX950-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX950-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX950-NEXT: v_ffbh_u32_e32 v3, v1
+; GFX950-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v4
+; GFX950-NEXT: v_ldexp_f32 v1, v2, v1
+; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX10: ; %bb.0:
@@ -36874,53 +39474,85 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ffbh_u32_e32 v6, v5
-; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
-; GFX9-NEXT: v_ffbh_u32_e32 v6, v1
-; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
-; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
-; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
-; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_ffbh_u32_e32 v6, v5
+; GFX900-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX900-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6
+; GFX900-NEXT: v_ffbh_u32_e32 v6, v1
+; GFX900-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
+; GFX900-NEXT: v_ldexp_f32 v5, v0, v1
+; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4
+; GFX900-NEXT: v_ffbh_u32_e32 v0, v3
+; GFX900-NEXT: v_min_u32_e32 v7, 32, v0
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_ffbh_u32_e32 v6, v5
+; GFX950-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX950-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX950-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6
+; GFX950-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX950-NEXT: v_ffbh_u32_e32 v5, v3
+; GFX950-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
+; GFX950-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX950-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX950-NEXT: v_ffbh_u32_e32 v3, v1
+; GFX950-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5
+; GFX950-NEXT: v_ldexp_f32 v1, v2, v1
+; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: v_mov_b32_e32 v1, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX10: ; %bb.0:
@@ -37236,66 +39868,105 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ffbh_u32_e32 v8, v5
-; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
-; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
-; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
-; GFX9-NEXT: v_ffbh_u32_e32 v4, v7
-; GFX9-NEXT: v_min_u32_e32 v10, 32, v4
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
-; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
-; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
-; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
-; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
-; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
-; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
-; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_ffbh_u32_e32 v8, v5
+; GFX900-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX900-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8
+; GFX900-NEXT: v_ldexp_f32 v8, v4, v5
+; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4
+; GFX900-NEXT: v_ffbh_u32_e32 v4, v7
+; GFX900-NEXT: v_min_u32_e32 v10, 32, v4
+; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX900-NEXT: v_ffbh_u32_e32 v7, v1
+; GFX900-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX900-NEXT: v_min_u32_e32 v7, 32, v7
+; GFX900-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_ldexp_f32 v4, v4, v6
+; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
+; GFX900-NEXT: v_ldexp_f32 v6, v0, v1
+; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4
+; GFX900-NEXT: v_ffbh_u32_e32 v0, v3
+; GFX900-NEXT: v_min_u32_e32 v8, 32, v0
+; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX900-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX900-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_ffbh_u32_e32 v8, v7
+; GFX950-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7]
+; GFX950-NEXT: v_min_u32_e32 v6, 1, v6
+; GFX950-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX950-NEXT: v_ffbh_u32_e32 v7, v5
+; GFX950-NEXT: v_min_u32_e32 v7, 32, v7
+; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX950-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX950-NEXT: v_cvt_f32_u32_e32 v6, v6
+; GFX950-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8
+; GFX950-NEXT: v_ldexp_f32 v5, v6, v5
+; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7
+; GFX950-NEXT: v_ldexp_f32 v4, v4, v6
+; GFX950-NEXT: v_ffbh_u32_e32 v6, v3
+; GFX950-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX950-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX950-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX950-NEXT: v_ffbh_u32_e32 v3, v1
+; GFX950-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX950-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX950-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6
+; GFX950-NEXT: v_ldexp_f32 v1, v2, v1
+; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX10: ; %bb.0:
@@ -37531,13 +40202,22 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_bf16:
; GFX10: ; %bb.0:
@@ -37600,14 +40280,24 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_fneg_lhs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_fneg_lhs_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_fneg_lhs_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_fneg_lhs_bf16:
; GFX10: ; %bb.0:
@@ -37674,14 +40364,24 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_fneg_rhs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_fneg_rhs_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_fneg_rhs_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_fneg_rhs_bf16:
; GFX10: ; %bb.0:
@@ -37765,16 +40465,28 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v2bf16:
; GFX10: ; %bb.0:
@@ -37859,18 +40571,32 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_vselect_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v2bf16:
; GFX10: ; %bb.0:
@@ -37946,15 +40672,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_select_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX900-LABEL: s_select_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_select_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_bf16:
; GFX10: ; %bb.0:
@@ -38046,21 +40784,39 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_select_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX900-LABEL: s_select_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_lshr_b32 s2, s0, 16
+; GFX900-NEXT: s_lshr_b32 s3, s1, 16
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-NEXT: s_mov_b32 s0, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v1, s0
+; GFX900-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_select_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_lshr_b32 s2, s0, 16
+; GFX950-NEXT: s_lshr_b32 s3, s1, 16
+; GFX950-NEXT: v_mov_b32_e32 v1, s3
+; GFX950-NEXT: v_mov_b32_e32 v2, s2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX950-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_v2bf16:
; GFX10: ; %bb.0:
@@ -38159,22 +40915,42 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_vselect_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX900-LABEL: s_vselect_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_lshr_b32 s2, s0, 16
+; GFX900-NEXT: s_lshr_b32 s3, s1, 16
+; GFX900-NEXT: v_mov_b32_e32 v2, s3
+; GFX900-NEXT: v_mov_b32_e32 v3, s2
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, s1
+; GFX900-NEXT: v_mov_b32_e32 v3, s0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: s_mov_b32 s0, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX900-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_vselect_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_lshr_b32 s2, s0, 16
+; GFX950-NEXT: s_lshr_b32 s3, s1, 16
+; GFX950-NEXT: v_mov_b32_e32 v2, s3
+; GFX950-NEXT: v_mov_b32_e32 v3, s2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, s1
+; GFX950-NEXT: v_mov_b32_e32 v3, s0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_vselect_v2bf16:
; GFX10: ; %bb.0:
@@ -38285,14 +41061,24 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v3bf16:
; GFX10: ; %bb.0:
@@ -38383,14 +41169,24 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v4bf16:
; GFX10: ; %bb.0:
@@ -38504,15 +41300,26 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v6bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v6bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v6bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v6bf16:
; GFX10: ; %bb.0:
@@ -38651,16 +41458,28 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v8bf16:
; GFX10: ; %bb.0:
@@ -38900,20 +41719,36 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v16bf16:
; GFX10: ; %bb.0:
@@ -39469,32 +42304,60 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_select_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_select_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
+; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_select_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v32bf16:
; GFX10: ; %bb.0:
@@ -39604,19 +42467,34 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_select_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: ; return to shader part epilog
+; GFX900-LABEL: s_select_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: v_mov_b32_e32 v2, s1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX900-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-NEXT: v_readfirstlane_b32 s1, v1
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_select_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v1, s2
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX950-NEXT: v_mov_b32_e32 v1, s3
+; GFX950-NEXT: v_mov_b32_e32 v2, s1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
+; GFX950-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_v3bf16:
; GFX10: ; %bb.0:
@@ -39720,18 +42598,32 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_select_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_readfirstlane_b32 s1, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX900-LABEL: s_select_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: v_mov_b32_e32 v2, s1
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s0, v1
+; GFX900-NEXT: v_readfirstlane_b32 s1, v0
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_select_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v1, s3
+; GFX950-NEXT: v_mov_b32_e32 v2, s1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX950-NEXT: v_mov_b32_e32 v1, s2
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX950-NEXT: v_readfirstlane_b32 s1, v0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v1
+; GFX950-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_v4bf16:
; GFX10: ; %bb.0:
@@ -39854,34 +42746,66 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
; GFX8-NEXT: v_readfirstlane_b32 s1, v2
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_vselect_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT: s_mov_b32 s1, 0x5040100
-; GFX9-NEXT: s_lshr_b32 s3, s0, 16
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v2
-; GFX9-NEXT: ; return to shader part epilog
+; GFX900-LABEL: s_vselect_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_lshr_b32 s4, s1, 16
+; GFX900-NEXT: s_lshr_b32 s5, s3, 16
+; GFX900-NEXT: v_mov_b32_e32 v4, s5
+; GFX900-NEXT: v_mov_b32_e32 v5, s4
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_mov_b32_e32 v4, s3
+; GFX900-NEXT: v_mov_b32_e32 v5, s1
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX900-NEXT: s_mov_b32 s1, 0x5040100
+; GFX900-NEXT: s_lshr_b32 s3, s0, 16
+; GFX900-NEXT: s_lshr_b32 s4, s2, 16
+; GFX900-NEXT: v_perm_b32 v2, v3, v2, s1
+; GFX900-NEXT: v_mov_b32_e32 v3, s4
+; GFX900-NEXT: v_mov_b32_e32 v4, s3
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_mov_b32_e32 v3, s2
+; GFX900-NEXT: v_mov_b32_e32 v4, s0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s1
+; GFX900-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-NEXT: v_readfirstlane_b32 s1, v2
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_vselect_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_lshr_b32 s4, s1, 16
+; GFX950-NEXT: s_lshr_b32 s5, s3, 16
+; GFX950-NEXT: v_mov_b32_e32 v4, s5
+; GFX950-NEXT: v_mov_b32_e32 v5, s4
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX950-NEXT: s_lshr_b32 s4, s2, 16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX950-NEXT: v_mov_b32_e32 v4, s3
+; GFX950-NEXT: v_mov_b32_e32 v5, s1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX950-NEXT: s_mov_b32 s1, 0x5040100
+; GFX950-NEXT: s_lshr_b32 s3, s0, 16
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX950-NEXT: v_perm_b32 v2, v3, v2, s1
+; GFX950-NEXT: v_mov_b32_e32 v3, s4
+; GFX950-NEXT: v_mov_b32_e32 v4, s3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX950-NEXT: v_mov_b32_e32 v3, s2
+; GFX950-NEXT: v_mov_b32_e32 v4, s0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s1
+; GFX950-NEXT: v_readfirstlane_b32 s1, v2
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_vselect_v4bf16:
; GFX10: ; %bb.0:
@@ -40053,26 +42977,48 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_vselect_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v3
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v2
+; GFX900-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX900-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
+; GFX900-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v3
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v2
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1
+; GFX950-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3]
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX950-NEXT: s_mov_b64 vcc, s[0:1]
+; GFX950-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v0, v3, v0, s0
+; GFX950-NEXT: v_perm_b32 v1, v2, v1, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v4bf16:
; GFX10: ; %bb.0:
@@ -40294,47 +43240,93 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_vselect_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
-; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX900-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX900-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX900-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v14
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX900-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v13
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4
+; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4
+; GFX900-NEXT: v_perm_b32 v3, v7, v6, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX950-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX950-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0
+; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v8bf16:
; GFX10: ; %bb.0:
@@ -40803,85 +43795,171 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_vselect_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v8
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v10
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v13
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v30
-; GFX9-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_and_b32_e32 v13, 1, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v29
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
-; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v19
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v27
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v28
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v6, v10, v6, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
-; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
-; GFX9-NEXT: v_perm_b32 v4, v11, v20, s4
-; GFX9-NEXT: v_perm_b32 v5, v12, v14, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v13, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v8
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v10
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6
+; GFX900-NEXT: v_and_b32_e32 v6, 1, v12
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6
+; GFX900-NEXT: v_and_b32_e32 v8, 1, v13
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9]
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8
+; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v22
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v30
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX900-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX900-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX900-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX900-NEXT: v_and_b32_e32 v13, 1, v14
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9]
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v21
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v29
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX900-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v19
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v28
+; GFX900-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX900-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v6, v10, v6, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v16
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v24
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc
+; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4
+; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4
+; GFX900-NEXT: v_perm_b32 v3, v9, v19, s4
+; GFX900-NEXT: v_perm_b32 v4, v11, v20, s4
+; GFX900-NEXT: v_perm_b32 v5, v12, v14, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v13, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX950-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX950-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX950-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX950-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX950-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX950-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX950-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v27
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX950-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX950-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v26
+; GFX950-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v31
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v18, v32, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v24
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0
+; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0
+; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0
+; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0
+; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0
+; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v16bf16:
; GFX10: ; %bb.0:
@@ -41981,205 +45059,438 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_vselect_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v19
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v21
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v23
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v22
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v24
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX9-NEXT: v_writelane_b32 v33, s30, 0
-; GFX9-NEXT: v_writelane_b32 v33, s31, 1
-; GFX9-NEXT: v_writelane_b32 v33, s34, 2
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_writelane_b32 v33, s35, 3
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
-; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
-; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35]
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31]
-; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95]
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93]
-; GFX9-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91]
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89]
-; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79]
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77]
-; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75]
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
-; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63]
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
-; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47]
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
-; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43]
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
-; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4
-; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4
-; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4
-; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4
-; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4
-; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4
-; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4
-; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4
-; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4
-; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4
-; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4
-; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4
-; GFX9-NEXT: v_perm_b32 v13, v26, v29, s4
-; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4
-; GFX9-NEXT: v_perm_b32 v15, v31, v30, s4
-; GFX9-NEXT: v_readlane_b32 s35, v33, 3
-; GFX9-NEXT: v_readlane_b32 s34, v33, 2
-; GFX9-NEXT: v_readlane_b32 s31, v33, 1
-; GFX9-NEXT: v_readlane_b32 s30, v33, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_vselect_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v17
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v16
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v19
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v18
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v21
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v20
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v23
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v22
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v25
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v24
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v27
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v26
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v29
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v28
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX900-NEXT: v_writelane_b32 v33, s30, 0
+; GFX900-NEXT: v_writelane_b32 v33, s31, 1
+; GFX900-NEXT: v_writelane_b32 v33, s34, 2
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX900-NEXT: v_writelane_b32 v33, s35, 3
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 1, v30
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
+; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
+; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35]
+; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31]
+; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95]
+; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93]
+; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91]
+; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89]
+; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79]
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77]
+; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75]
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
+; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63]
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
+; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59]
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
+; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47]
+; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
+; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43]
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
+; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
+; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
+; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
+; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4
+; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v6, v9, s4
+; GFX900-NEXT: v_perm_b32 v4, v8, v11, s4
+; GFX900-NEXT: v_perm_b32 v5, v10, v13, s4
+; GFX900-NEXT: v_perm_b32 v6, v12, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v14, v17, s4
+; GFX900-NEXT: v_perm_b32 v8, v16, v19, s4
+; GFX900-NEXT: v_perm_b32 v9, v18, v21, s4
+; GFX900-NEXT: v_perm_b32 v10, v20, v23, s4
+; GFX900-NEXT: v_perm_b32 v11, v22, v25, s4
+; GFX900-NEXT: v_perm_b32 v12, v24, v27, s4
+; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4
+; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4
+; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4
+; GFX900-NEXT: v_readlane_b32 s35, v33, 3
+; GFX900-NEXT: v_readlane_b32 s34, v33, 2
+; GFX900-NEXT: v_readlane_b32 s31, v33, 1
+; GFX900-NEXT: v_readlane_b32 s30, v33, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_vselect_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:60
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:124
+; GFX950-NEXT: scratch_load_ushort v33, off, s32
+; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:64
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:128
+; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:120
+; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:56
+; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116
+; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:52
+; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:112
+; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48
+; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88
+; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24
+; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92
+; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28
+; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:108
+; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:44
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:96
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:104
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40
+; GFX950-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29
+; GFX950-NEXT: scratch_load_dword v29, off, s32 offset:84
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:20
+; GFX950-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX950-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX950-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX950-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX950-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX950-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX950-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX950-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX950-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX950-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX950-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX950-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX950-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX950-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX950-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX950-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX950-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX950-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX950-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX950-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX950-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX950-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX950-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX950-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshrrev_b32_e32 v46, 16, v31
+; GFX950-NEXT: s_waitcnt vmcnt(23)
+; GFX950-NEXT: v_lshrrev_b32_e32 v47, 16, v32
+; GFX950-NEXT: s_waitcnt vmcnt(22)
+; GFX950-NEXT: v_and_b32_e32 v28, 1, v33
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:80
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:16
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v28
+; GFX950-NEXT: v_and_b32_e32 v28, 1, v30
+; GFX950-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
+; GFX950-NEXT: scratch_load_dword v28, off, s32 offset:76
+; GFX950-NEXT: scratch_load_dword v30, off, s32 offset:12
+; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: v_lshrrev_b32_e32 v58, 16, v34
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshrrev_b32_e32 v59, 16, v35
+; GFX950-NEXT: v_cndmask_b32_e64 v34, v35, v34, s[4:5]
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:72
+; GFX950-NEXT: v_cndmask_b32_e64 v58, v59, v58, s[2:3]
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:8
+; GFX950-NEXT: v_cndmask_b32_e64 v31, v32, v31, s[0:1]
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:68
+; GFX950-NEXT: v_cndmask_b32_e32 v46, v47, v46, vcc
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:4
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_waitcnt vmcnt(26)
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
+; GFX950-NEXT: s_waitcnt vmcnt(24)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v39
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v38
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
+; GFX950-NEXT: s_waitcnt vmcnt(22)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v49
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v48
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; GFX950-NEXT: s_waitcnt vmcnt(16)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v55
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v54
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v54, v55, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX950-NEXT: s_waitcnt vmcnt(10)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v45
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v44
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v44, v45, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v43
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v41
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v40
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v40, v41, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v53
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v52
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v51
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v50
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v37, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX950-NEXT: s_waitcnt vmcnt(8)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v56
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v29, v56, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v29, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX950-NEXT: s_waitcnt vmcnt(6)
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v57
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v33, v57, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v33, v29, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX950-NEXT: s_waitcnt vmcnt(4)
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v28, v30, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v28, v29, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v59
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v35
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v35, v59, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v29, v28, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v47
+; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v47, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v29, v28, vcc
+; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0
+; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0
+; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0
+; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0
+; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0
+; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0
+; GFX950-NEXT: v_perm_b32 v8, v17, v16, s0
+; GFX950-NEXT: v_perm_b32 v9, v19, v18, s0
+; GFX950-NEXT: v_perm_b32 v10, v21, v20, s0
+; GFX950-NEXT: v_perm_b32 v11, v23, v22, s0
+; GFX950-NEXT: v_perm_b32 v12, v25, v24, s0
+; GFX950-NEXT: v_perm_b32 v13, v27, v26, s0
+; GFX950-NEXT: v_perm_b32 v14, v46, v31, s0
+; GFX950-NEXT: v_perm_b32 v15, v58, v34, s0
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v32bf16:
; GFX10: ; %bb.0:
@@ -42769,21 +46080,31 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fma_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_bf16:
; GFX10: ; %bb.0:
@@ -42912,31 +46233,45 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fma_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v5, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v2bf16:
; GFX10: ; %bb.0:
@@ -43118,41 +46453,60 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fma_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v6, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5
+; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v3bf16:
; GFX10: ; %bb.0:
@@ -43394,50 +46748,73 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fma_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fma_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX900-NEXT: v_fma_f32 v6, v8, v7, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v7, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v4bf16:
; GFX10: ; %bb.0:
@@ -43640,28 +47017,41 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmuladd_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_bf16:
; GFX10: ; %bb.0:
@@ -43839,45 +47229,65 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmuladd_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v2bf16:
; GFX10: ; %bb.0:
@@ -44145,62 +47555,90 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmuladd_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v3bf16:
; GFX10: ; %bb.0:
@@ -44560,78 +47998,113 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmuladd_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fmuladd_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_add_f32_e32 v6, v6, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmuladd_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT: v_add_f32_e32 v6, v6, v7
+; GFX950-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v4bf16:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index bc81756..c14678c 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index b372dec..628301b8 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
; BFI_INT Definition pattern from ISA docs
; (y & x) | (z & ~x)
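
For reference, a minimal LLVM IR sketch of that pattern (function and value names are illustrative, not taken from the test), which the AMDGPU backend is typically expected to select to a single v_bfi_b32:

define i32 @bfi_pattern_sketch(i32 %x, i32 %y, i32 %z) {
  %lhs  = and i32 %y, %x        ; y & x
  %notx = xor i32 %x, -1        ; ~x
  %rhs  = and i32 %z, %notx     ; z & ~x
  %res  = or i32 %lhs, %rhs     ; (y & x) | (z & ~x)
  ret i32 %res
}
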
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
index 3d52c15..bd76f34 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
; GCN-LABEL: v_bfi_single_nesting_level:
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index d287d00..a12b5ea 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=VI %s
define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 {
; SI-LABEL: s_bfm_pattern:
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
index 6a48aee..9323800 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
; GCN: global_store_short
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll b/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll
index 58f062b..57393a4 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-v4f16-v4i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope %s
; creating v4i16->v4f16 and v4f16->v4i16 bitcasts in the selection DAG is rather
; difficult, so this test has to throw in some llvm.amdgcn.wqm to get them
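
A hedged sketch of that trick (function name is made up; one plausible reading is that llvm.amdgcn.wqm acts as an opaque barrier, so the bitcast cannot be folded away before selection and a real v4f16->v4i16 cast reaches the DAG):

define <4 x i16> @force_v4f16_to_v4i16(<4 x half> %in) {
  %w  = call <4 x half> @llvm.amdgcn.wqm.v4f16(<4 x half> %in)  ; opaque barrier
  %bc = bitcast <4 x half> %w to <4 x i16>                      ; the cast under test
  ret <4 x i16> %bc
}

declare <4 x half> @llvm.amdgcn.wqm.v4f16(<4 x half>)
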
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index ca33993..913dc3c 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; The bitcast should be pushed through the bitcasts so the vectors can
; be broken down and the shared components can be CSEd
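
As a rough illustration of the shape being tested (names are invented for this sketch): scalar extracts from a bitcast vector should decay into per-element operations, so components used more than once can be shared rather than re-extracted:

define void @extract_via_bitcast(<2 x i64> %v, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
  %bc = bitcast <2 x i64> %v to <4 x i32>
  %e0 = extractelement <4 x i32> %bc, i32 0
  %e2 = extractelement <4 x i32> %bc, i32 2
  store i32 %e0, ptr addrspace(1) %p0
  store i32 %e2, ptr addrspace(1) %p1
  ret void
}
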
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
index de2e256..58a4a22 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; Test that materialization constants that are the bit reversed of
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index ab078be..d4f5617 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-FLAT,GFX11-FLAT-TRUE16
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-FLAT,GFX11-FLAT-FAKE16
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=FLAT
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefix=FLAT
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel | FileCheck %s --check-prefix=GISEL
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX11-FLAT,GFX11-FLAT-TRUE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX11-FLAT,GFX11-FLAT-FAKE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -global-isel | FileCheck %s --check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -global-isel | FileCheck %s --check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16
declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index b27ad26..2761cba 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @br_cc_f16(
; SI-LABEL: br_cc_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 7eb7d72..006fe51 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -766,10 +766,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec
; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
index 08f19a5..0f8275c 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -amdgpu-s-branch-bits=5 < %s | FileCheck -check-prefix=GCN %s
; Restrict maximum branch to between +15 and -16 dwords
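
Checking the arithmetic behind that flag (assuming -amdgpu-s-branch-bits=5 denotes a 5-bit signed dword offset): the representable range is -2^4 .. 2^4-1 = -16 .. +15 dwords, i.e. -64 .. +60 bytes, which matches the comment above.
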
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir
index 253e7e2..0e5ef3c 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir
@@ -68,7 +68,7 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000)
; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4
+ ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc
; CHECK-NEXT: {{ $}}
@@ -149,7 +149,7 @@ body: |
successors: %bb.3(0x04000000), %bb.2(0x7c000000)
liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1
- INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4
+ INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4
S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit killed $scc
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
index 474ba71..a25c52f 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
@@ -69,7 +69,7 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000)
; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4
+ ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc
; CHECK-NEXT: {{ $}}
@@ -151,7 +151,7 @@ body: |
successors: %bb.3(0x04000000), %bb.2(0x7c000000)
liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1
- INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4
+ INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4
S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit killed $scc
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 83ab6c3..ab2ad19 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -o - %s | FileCheck %s
define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
; CHECK-LABEL: spill:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
index 903bc85..722dff0 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
; For gfx1010, overestimate the branch size in case we need to insert
; a nop for the buggy offset.
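;
; For reference (derived from the flag above rather than stated in this
; test): -amdgpu-s-branch-bits=7 leaves a signed window of -2^6 .. 2^6-1,
; i.e. -64 .. +63 dwords, and the gfx1010 workaround's possible extra nop
; must still fit inside that overestimated budget.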
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index d103423..2ad7818 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -145,13 +145,14 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: s_wait_xcnt 0x0
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GCN-NEXT: s_mov_b32 s0, exec_lo
; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2
; GCN-NEXT: s_cbranch_execnz .LBB3_1
@@ -167,7 +168,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-NEXT: s_sleep 0
; GCN-NEXT: s_sleep 0
; GCN-NEXT: .LBB3_2: ; %bb3
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
; GCN-NEXT: s_wait_storecnt 0x0
@@ -588,7 +588,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GCN-NEXT: s_wait_alu 0xfffe
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT: global_store_b32 v1, v0, s[0:1]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
index 8d07614..eaba9d5 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; Make sure the code size estimate for inline asm is 12 bytes per
; instruction, rather than 8 as on previous generations.
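;
; Worked example of the estimate (illustrative numbers, not taken from this
; test's checks): an 8-instruction v_nop_e64 inline asm blob is costed at
; 8 * 12 = 96 bytes on gfx10/gfx11, versus 8 * 8 = 64 bytes on earlier
; generations, which moves the point at which branch relaxation triggers.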
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index b03ade4..5959f76 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
; See PR33579.
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s
; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s
; OBJ: Relocations [
diff --git a/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll b/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll
index 00938ce..5a352e4 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; The branch instruction in LOOP49 has a uniform condition, but PHI instructions
; introduced by the structurizecfg pass previously caused a false divergence
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 4787f21..7c48544 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn-- | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
index a141143..b08e9c4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
index eb452dc..b80aa93 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
index 37928a7..96b191d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
index 790cd8e..c30b554 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
index 89e1a4b..2abd7ed 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index 384beae..9189f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=None -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -amdgpu-atomic-optimizer-strategy=None -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) dereferenceable(18446744073709551615) %arg0, i32 %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll
index 84a4b57..96b71cf 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=SDAG %s
define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) {
; GISEL-LABEL: buffer_ptr_vector_ops:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
index 7278639..500cc7e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; The buffer_loads and buffer_stores all access the same location. Check that
; they do not get reordered by the scheduler.
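;
; A minimal sketch of the guarded pattern (the function name and the
; raw.ptr.buffer intrinsic flavor are assumptions; the original test may use
; a different buffer intrinsic): a store followed by a load of the same
; buffer slot must keep program order.
define amdgpu_ps float @store_then_load(ptr addrspace(8) inreg %rsrc) {
  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  ret float %v
}
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg)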
diff --git a/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll b/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll
index 95f97ad..8d9c1b6 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=false < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis=false < %s | FileCheck %s
; CHECK-LABEL: {{^}}_amdgpu_ps_main:
;
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
index a76390b..93275d0 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
; This used to cause a circular chain dependency during
; SelectionDAG instruction scheduling.
diff --git a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
index 162b88d..3126491 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -enable-new-pm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -enable-new-pm | FileCheck %s --check-prefixes=CHECK
; This caused an infinite cycle in the SelectionDAG combiner due to a missing insert_subvector.
;
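; A hand-written sketch of the failing shape (assumed for illustration; it is
; not the original reproducer): concatenating two <2 x double> halves and
; re-extracting a misaligned 2-element slice exercises the
; insert_subvector/extract_subvector paths the combine was missing.
define <2 x double> @subvector_sketch(<2 x double> %lo, <2 x double> %hi) {
  %full = shufflevector <2 x double> %lo, <2 x double> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sub = shufflevector <4 x double> %full, <4 x double> poison, <2 x i32> <i32 1, i32 2>
  ret <2 x double> %sub
}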
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
index f70b3fd..8f3e905 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; There was an infinite loop in DAGCombiner from a target build_vector
; combine and a generic insert_vector_elt combine.
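;
; Sketch of the ping-pong shape (an assumption for illustration, not the
; original reproducer): an insertelement into a loaded packed vector feeding
; a store, where the target build_vector combine and the generic
; insert_vector_elt combine could each undo the other's rewrite.
define void @insert_elt_sketch(ptr addrspace(1) %p, i16 %v) {
  %vec = load <2 x i16>, ptr addrspace(1) %p
  %ins = insertelement <2 x i16> %vec, i16 %v, i32 1
  store <2 x i16> %ins, ptr addrspace(1) %p
  ret void
}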
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 5c7172f..bdb52db 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
define void @undef_lo_v2i16(i16 %arg0) {
; GFX9-LABEL: undef_lo_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 1f0e093..1cc6209 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
%struct.ByValStruct = type { [4 x i32] }
; Make sure the offset is folded and the function's frame register is used
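;
; A sketch of the byval pattern involved (the callee and caller names are
; assumptions): passing a private-memory struct byval creates frame-index
; arithmetic that should fold into the access through the function's frame
; register rather than materializing a separate add.
declare void @takes_byval(ptr addrspace(5) byval(%struct.ByValStruct) align 4)
define void @byval_caller() {
  %tmp = alloca %struct.ByValStruct, align 4, addrspace(5)
  call void @takes_byval(ptr addrspace(5) byval(%struct.ByValStruct) align 4 %tmp)
  ret void
}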
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 9f47735..2a1be99 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s
declare hidden void @external_void_func_i1(i1) #0
declare hidden void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/call-c-function.ll b/llvm/test/CodeGen/AMDGPU/call-c-function.ll
index ba52577..e1bb3ea 100644
--- a/llvm/test/CodeGen/AMDGPU/call-c-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-c-function.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
; Test that we don't explode on calls from shaders to functions with the C calling convention.
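;
; Minimal shape under test, reconstructed from the description above (the
; callee name is an assumption): an entry-point shader calling through the
; default C calling convention.
declare void @c_callee(i32)
define amdgpu_ps void @shader_entry(i32 %x) {
  call void @c_callee(i32 %x)
  ret void
}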
diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
index 3b1fd80..5f324df 100644
--- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
; GCN: s_getpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/call-encoding.ll b/llvm/test/CodeGen/AMDGPU/call-encoding.ll
index 8b61e4d..6954c34 100644
--- a/llvm/test/CodeGen/AMDGPU/call-encoding.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-encoding.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj -verify-machineinstrs < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -verify-machineinstrs < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s
-; XUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=obj -verify-machineinstrs < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=hawaii -d - | FileCheck --check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s
+; XUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=hawaii -d - | FileCheck --check-prefixes=GCN,CI %s
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index dbd00f0..4df1049 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,8 +1,8 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs | FileCheck -check-prefixes=GCN,CI %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs | FileCheck -check-prefixes=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs | FileCheck -check-prefixes=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
; Make sure to run a GPU with the SGPR allocation bug.
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 44be28f..69ad8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
declare hidden void @external_void_func_void() #3
diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll
index bf99648..c0f74fd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s
declare void @external_void_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 9561aa5..e7254eb 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s
define void @callee_no_stack() #0 {
; GCN-LABEL: callee_no_stack:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index da49140..ff80250 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
; Make sure we don't crash or assert on the spir_kernel calling convention.
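;
; Sketch of the convention in question (function name assumed): a kernel
; declared with the spir_kernel calling convention should compile like any
; other kernel entry point.
define spir_kernel void @spir_entry(ptr addrspace(1) %out) {
  store i32 0, ptr addrspace(1) %out
  ret void
}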
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 963b3a5..32023a7 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; TODO: Test with flat scratch
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index d0ae30f..b71885b 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2,13 +2,14 @@
; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL %s
; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel -enable-new-pm < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CISI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1010 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=CISI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s
; GCN-ISEL-LABEL: name: sadd64rr
; GCN-ISEL-LABEL: body:
@@ -113,6 +114,19 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: sadd64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%add = add i64 %a, %b
store i64 %add, ptr addrspace(1) %out
@@ -211,6 +225,17 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: sadd64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], lit64(0x123456789876)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%add = add i64 20015998343286, %a
store i64 %add, ptr addrspace(1) %out
@@ -301,6 +326,17 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vadd64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -391,6 +427,17 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vadd64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1]
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,6 +533,18 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: suaddo32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -606,6 +665,21 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: uaddo32_vcc_user:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -741,6 +815,21 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: suaddo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -874,6 +963,23 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vuaddo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -987,6 +1093,19 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ssub64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%sub = sub i64 %a, %b
store i64 %sub, ptr addrspace(1) %out
@@ -1085,6 +1204,17 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ssub64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_nc_u64 s[2:3], lit64(0x123456789876), s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%sub = sub i64 20015998343286, %a
store i64 %sub, ptr addrspace(1) %out
@@ -1175,6 +1305,17 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vsub64rr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1265,6 +1406,17 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vsub64ri:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1]
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1361,6 +1513,18 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: susubo32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1481,6 +1645,21 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: usubo32_vcc_user:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1616,6 +1795,21 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: susubo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
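; (A note on the GFX1250 check lines above, offered as a reading of the code
; rather than anything stated in this test: for unsigned x - y, a borrow occurs
; exactly when the wrapped result compares greater than x, which is why the
; sequence feeds v_cmp_gt_u64 of result vs. lhs into the v_cndmask that
; materializes the i1 overflow value.)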
@@ -1749,6 +1943,23 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: vusubo64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3]
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -2904,6 +3115,164 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: .LBB16_4:
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_branch .LBB16_2
+;
+; GFX1250-LABEL: sudiv64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_b64 s[0:1], s[0:1], lit64(0xffffffff00000000)
+; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
+; GFX1250-NEXT: ; %bb.1:
+; GFX1250-NEXT: s_cvt_f32_u32 s0, s2
+; GFX1250-NEXT: s_cvt_f32_u32 s1, s3
+; GFX1250-NEXT: s_sub_nc_u64 s[6:7], 0, s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000
+; GFX1250-NEXT: v_s_rcp_f32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc
+; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-NEXT: s_trunc_f32 s1, s1
+; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000
+; GFX1250-NEXT: s_cvt_u32_f32 s5, s1
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-NEXT: s_cvt_u32_f32 s4, s0
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13
+; GFX1250-NEXT: s_mul_i32 s14, s4, s13
+; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12
+; GFX1250-NEXT: s_mul_i32 s17, s5, s12
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15]
+; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12
+; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13
+; GFX1250-NEXT: s_add_co_u32 s0, s14, s17
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16
+; GFX1250-NEXT: s_mul_i32 s12, s5, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
+; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7
+; GFX1250-NEXT: s_mul_i32 s12, s4, s7
+; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX1250-NEXT: s_mul_i32 s15, s5, s6
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
+; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6
+; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7
+; GFX1250-NEXT: s_add_co_u32 s0, s12, s15
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14
+; GFX1250-NEXT: s_mul_i32 s6, s5, s7
+; GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7]
+; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0
+; GFX1250-NEXT: s_mul_i32 s4, s10, s0
+; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0
+; GFX1250-NEXT: s_mul_i32 s6, s11, s0
+; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7
+; GFX1250-NEXT: s_mul_i32 s13, s11, s7
+; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5]
+; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7
+; GFX1250-NEXT: s_add_co_u32 s4, s4, s13
+; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0
+; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7]
+; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_or_b32 s6, s6, s4
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7]
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2
+; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4
+; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2
+; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3
+; GFX1250-NEXT: s_cmp_lg_u32 s12, 0
+; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1
+; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
+; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_ge_u32 s4, s3
+; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s4, s3
+; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo
+; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
+; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_ge_u32 s0, s3
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s0, s3
+; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0
+; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo
+; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo
+; GFX1250-NEXT: s_cbranch_execnz .LBB16_3
+; GFX1250-NEXT: .LBB16_2:
+; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1250-NEXT: s_sub_co_i32 s1, 0, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX1250-NEXT: v_nop
+; GFX1250-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: s_mul_i32 s1, s1, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0
+; GFX1250-NEXT: s_mul_i32 s1, s0, s2
+; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
+; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_sub_co_i32 s4, s1, s2
+; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
+; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
+; GFX1250-NEXT: s_cselect_b32 s1, s4, s1
+; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
+; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: .LBB16_3:
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
+; GFX1250-NEXT: s_endpgm
+; GFX1250-NEXT: .LBB16_4:
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: s_branch .LBB16_2
%result = udiv i64 %x, %y
store i64 %result, ptr addrspace(1) %out
ret void
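; (Structure of the GFX1250 block above, inferred from its branches: the
; prologue ORs x and y and tests the high 32 bits; when both operands fit in
; 32 bits it takes .LBB16_4 -> .LBB16_2 and divides with a single 32-bit
; reciprocal sequence, otherwise %bb.1 runs the full 64-bit reciprocal
; refinement, and both paths meet at the store in .LBB16_3.)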
diff --git a/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll
index e3fa683..75cc2d85 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-sgpr-limit.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s
; CHECK: s_add_i32 s0, s0, s1
; CHECK: s_add_i32 s1, s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll b/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll
index 8c34c12..35039d1 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-sgpr-over-limit.ll
@@ -1,6 +1,6 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=verde -o /dev/null %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=tonga -o /dev/null %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck %s
;CHECK: LLVM ERROR: unable to allocate function argument
define amdgpu_gs { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } @_amdgpu_gs_sgpr_i32 (i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg) {
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index f78cb0d..b5352be 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -O0 < %s | FileCheck -check-prefix=GCN_DBG %s
define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-LABEL: test_loop:
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index df35a4e..a92b99a 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; This particular case will actually be worse in terms of code size
; from sinking into both.
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 9f48c8b..d458167 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 7407fc6..b9caf8e 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; SI-LABEL: v_clamp_add_src_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 6274b38..5eb6b2f 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
; GFX6-LABEL: v_clamp_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index e4aa01f..8769270 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX9 %s
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX11 %s
; RUN: FileCheck --enable-var-scope --check-prefixes=DBG,DBG11 %s < %t
; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 29d9299..9e25f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
declare i1 @llvm.amdgcn.class.f32(float, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
index 48fa5e9..3c9ded8 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Check that the register coalescer does not create an odd subreg when register
; tuples must be aligned.
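; (Background, offered as an assumption rather than taken from this test:
; gfx90a requires 64-bit VGPR tuples to start at an even register, so v[2:3]
; is a legal pairing where v[3:4] is not, and coalescing must not fold copies
; in a way that would demand the odd-aligned tuple.)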
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
index 4404f1a..ac8ef48 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
@@ -20,10 +20,10 @@ body: |
; CHECK-LABEL: name: foo1
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
- INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32
+ INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32
undef %2.sub0:vreg_64 = COPY killed %0
%2.sub1:vreg_64 = COPY killed %1
FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
@@ -41,10 +41,10 @@ body: |
; CHECK-LABEL: name: foo2
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
- INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32
+ INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32
undef %2.sub0:vreg_64 = COPY killed %0
%2.sub1:vreg_64 = COPY killed %1
FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
@@ -62,10 +62,10 @@ body: |
; CHECK-LABEL: name: foo3
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
- INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32
+ INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32
undef %2.sub0:vreg_64 = COPY killed %1
%2.sub1:vreg_64 = COPY killed %0
FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
@@ -83,10 +83,10 @@ body: |
; CHECK-LABEL: name: foo4
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
- INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32
+ INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32
undef %2.sub0:vreg_64 = COPY killed %1
%2.sub1:vreg_64 = COPY killed %0
FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
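; (A reading of the changed immediates above, offered as an assumption: an
; INLINEASM operand-flag word keeps the operand kind and count in its low bits
; and the target register-class ID in its upper half, so renumbering AMDGPU
; register classes shifts these constants: here 1835018/0x1C000A became
; 2228234/0x22000A, while the low kind bits, 0xA for regdef and 0xB for
; regdef-ec, are unchanged.)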
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 61830f1..d95890d 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-- -o - %s | FileCheck %s
declare float @llvm.fma.f32(float, float, float)
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index dea9142..f9fae02 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -737,7 +737,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 0x7b ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0xfd,0x03,0x7b,0x00,0x00,0x00]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x7b, v[0:1] ; encoding: [0xff,0x00,0x00,0x50,0x7b,0x00,0x00,0x00]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%add = add i64 %x, 123
ret i64 %add
@@ -747,7 +747,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) {
; GFX10: codeLenInByte = 28
; GFX1100: codeLenInByte = 32
; GFX1150: codeLenInByte = 32
-; GFX1250: codeLenInByte = 24
+; GFX1250: codeLenInByte = 20
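; (Size accounting, read off the encodings above: the new VOP2 v_add_nc_u64_e32
; with a 32-bit literal encodes in 8 bytes, versus 12 for the replaced VOP3
; v_lshl_add_u64, so the GFX1250 total shrinks by 4 bytes, 24 -> 20.)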
define i64 @v_add_u64_vop2_literal_64(i64 %x) {
; GFX9-LABEL: v_add_u64_vop2_literal_64:
@@ -788,9 +788,7 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
-; GFX1250-NEXT: s_mov_b64 s[0:1], lit64(0x112345678) ; encoding: [0xfe,0x01,0x80,0xbe,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00]
-; GFX1250-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
-; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0x01,0x00]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x112345678), v[0:1] ; encoding: [0xfe,0x00,0x00,0x50,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00]
; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%add = add i64 %x, 4600387192
ret i64 %add
@@ -800,6 +798,6 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) {
; GFX10: codeLenInByte = 28
; GFX1100: codeLenInByte = 32
; GFX1150: codeLenInByte = 32
-; GFX1250: codeLenInByte = 36
+; GFX1250: codeLenInByte = 24
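; (Likewise: the removed s_mov_b64 lit64 (12 bytes) + s_wait_alu (4 bytes) +
; v_lshl_add_u64 (8 bytes) sequence totals 24 bytes and is replaced by a single
; 12-byte v_add_nc_u64_e32 with a 64-bit literal, hence 36 -> 24.)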
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; NOT-GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
index 6dc05da..73d0ecd 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
@@ -1,5 +1,5 @@
; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=SI-LLC %s
; OPT-LABEL: @test(
; OPT: mul nsw i32
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index b937501..c30ce8c 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Disabled endcf collapse at -O0.
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
; Note: Breaking large PHIs is disabled to prevent branches from being eliminated (in scc_liveness)
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index ce4db2f..2558da4 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1010 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1100 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX1100 %s
; Test that unused lanes in the s_xor result are masked out with v_cndmask.
diff --git a/llvm/test/CodeGen/AMDGPU/combine-and-sext-bool.ll b/llvm/test/CodeGen/AMDGPU/combine-and-sext-bool.ll
index 4b0fc93..fe8a14c 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-and-sext-bool.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-and-sext-bool.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @and_i1_sext_bool(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: and_i1_sext_bool:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index ba8abdc..3d315f8 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx902 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: add1:
diff --git a/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll b/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
index 2cbd1b4..8a01964 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}combine_ftrunc_frint_f64:
; GCN: v_rndne_f64_e32 [[RND:v\[[0-9:]+\]]],
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 93b5f15..211174a 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadCombine:
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 1d20218..57a1e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11,GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11,GFX11-FAKE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GFX11NONANS,GCN-TRUE16,GFX11NONANS-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GFX11NONANS,GCN-FAKE16,GFX11NONANS-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GFX11NONANS,GCN-TRUE16,GFX11NONANS-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GFX11NONANS,GCN-FAKE16,GFX11NONANS-FAKE16
; The tests check the following optimization of DAGCombiner:
; CMP(A,C)||CMP(B,C) => CMP(MIN/MAX(A,B), C)
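; As a hypothetical IR sketch of that fold (illustrative only, not taken from
; this test; it assumes NaNs can be ignored so the min is a sound replacement):
;   %c0 = fcmp olt float %a, 1.0
;   %c1 = fcmp olt float %b, 1.0
;   %r  = or i1 %c0, %c1
; can become
;   %m = call float @llvm.minnum.f32(float %a, float %b)
;   %r = fcmp olt float %m, 1.0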
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
index cc29152..9286dd8 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-sdwa-peephole=0 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-sdwa-peephole=0 < %s | FileCheck -check-prefix=GISEL %s
define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float inreg %a) {
; SDAG-LABEL: fcmp_f32_olt_to_ogt:
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index ae8080c..ce46094 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 820ccb1..d1fe78d 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
; SI-LABEL: main:
diff --git a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
index d9eca0d..076468e 100644
--- a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
diff --git a/llvm/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll b/llvm/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll
index e15e701..b93ece7 100644
--- a/llvm/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx802 < %s | FileCheck %s
; CHECK: s_waitcnt
define <2 x i16> @main(<2 x float>) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
index 9e08a04..7fd15fe 100644
--- a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}test_concat_v1i32:
; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 080fe12..150f667 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=verde < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}fold_mi_v_and_0:
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index b81392d..3d5add1 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VMEM -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -amdgpu-spill-sgpr-to-vgpr=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VGPR -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -enable-var-scope -check-prefix=VMEM -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -amdgpu-spill-sgpr-to-vgpr=1 < %s | FileCheck -enable-var-scope -check-prefix=VGPR -check-prefix=GCN %s
; Verify registers used for tracking exec mask changes when all
; registers are spilled at the end of the block. The SGPR spill
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
index 0fe857b..d22214f 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
; optnone disables AMDGPUAnnotateUniformValues, so no branch is known
; to be uniform during instruction selection. The custom selection for
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
index 238f6ab..61d102d 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
@@ -1,6 +1,6 @@
-; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
-; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
+; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
; CHECK-LABEL: name: basic_call
; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index 0574de3..f94d6bd 100644
--- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}convergent_inlineasm:
diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index ed0a97c..1f4e200 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(4) %addrSrc) {
; GCN-LABEL: copy_to_scc:
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 93cb11b..38c20c7 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; SelectionDAG builder was using the IR value kind to decide how to
; split the types for copyToRegs/copyFromRegs in all contexts. This
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 7aca63d..f351b8b 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GCN
define i32 @test(i32 %val, i32 %cond) {
; GCN-LABEL: test:
diff --git a/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll b/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
index 04483ba..6290424 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 | FileCheck %s
; Check that the redundant immediate MOV instruction
; (by-product of handling phi nodes) is not found
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 52c9081..f6cd3d1 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
-; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -enable-var-scope --check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -enable-var-scope --check-prefix=VI
+; RUN: llc < %s -mtriple=r600 -mcpu=cypress | FileCheck %s -enable-var-scope --check-prefix=EG
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -enable-var-scope --check-prefix=GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16
declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 99b7c773..4b151b9 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=EG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll
index e1d2009..237eefe 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index fb418af..1b9b508 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
declare i16 @llvm.ctpop.i16(i16) nounwind readnone
declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 3504546..37f5889 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 7f83fc57..d17cdeb 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
-; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -enable-var-scope --check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -enable-var-scope --check-prefix=VI
+; RUN: llc < %s -mtriple=r600 -mcpu=cypress | FileCheck %s -enable-var-scope --check-prefix=EG
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -enable-var-scope --check-prefix=GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 73fddb5..137acd34 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=EG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cube.ll b/llvm/test/CodeGen/AMDGPU/cube.ll
index 72711df..ea0ebf8 100644
--- a/llvm/test/CodeGen/AMDGPU/cube.ll
+++ b/llvm/test/CodeGen/AMDGPU/cube.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.cubeid(float, float, float) #0
declare float @llvm.amdgcn.cubesc(float, float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 86e890b..b5bc09a 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck %s -check-prefixes=GCN,SI
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -check-prefixes=GCN,VI
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
index c34d669..0974ce9 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -enable-no-nans-fp-math < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.fabs.f32(float) #1
declare float @llvm.floor.f32(float) #1
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
index d4bafa1..0203b2d 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -enable-no-nans-fp-math < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.fabs.f32(float) #1
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
index cdf4a88..39af6a05 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) {
; GCN-LABEL: private_load_maybe_divergent:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll
index 09607c9..6c93eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; We are only checking that instruction selection can succeed in this case. This
; cut-down test results in no instructions, but that's fine.
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index e285689..9ee41bd 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GFX10-LABEL: _amdgpu_ps_main:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
index 0bda7e4..81fda98 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -stop-after=amdgpu-isel -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -stop-after=amdgpu-isel -O0 < %s | FileCheck -check-prefix=GCN %s
define i32 @divergent_lshr_and_cmp(i32 %x) {
; GCN-LABEL: name: divergent_lshr_and_cmp
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index 18b250d..af1c643 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; Test for a bug where DAGCombiner::ReassociateOps() was creating adds
; with offset in the first operand and base pointers in the second.
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index 1f7bb76..85180a2 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and1:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
index 2e84304..60194b6 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
define amdgpu_kernel void @eq_t(float %x) {
; GCN-LABEL: eq_t:
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value.ll b/llvm/test/CodeGen/AMDGPU/debug-value.ll
index 60ffc28..6b2a36c 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-value.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck %s
%struct.wombat = type { [4 x i32], [4 x i32], [4 x i32] }
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value2.ll b/llvm/test/CodeGen/AMDGPU/debug-value2.ll
index 3a16476..3454831 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value2.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-value2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
%struct.ShapeData = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32, i64, <4 x float>, i32, i8, i8, i16, i32, i32 }
diff --git a/llvm/test/CodeGen/AMDGPU/debug.ll b/llvm/test/CodeGen/AMDGPU/debug.ll
index 783b3ce..9920076 100644
--- a/llvm/test/CodeGen/AMDGPU/debug.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI %s
; Test for a crash in the custom assembly dump code.
diff --git a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
index 49486ad..b63fff3 100644
--- a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_default_si:
; GCN: FloatMode: 240
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 20f48da..c126f9e 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -15,7 +15,7 @@ define internal void @direct() {
; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
; CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; CHECK-NEXT: call void @indirect()
+; CHECK-NEXT: call void [[FP]]()
; CHECK-NEXT: ret void
;
%fptr = alloca ptr, addrspace(5)
@@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
}
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll b/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
index 730df53..4d969bf 100644
--- a/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
+++ b/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}name:{{[ ]*}}vector_clause
; GCN: S_LOAD_DWORDX4
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 4cb0d2d..e6c38d2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1046,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2667,28 +2659,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2699,7 +2684,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3238,10 +3222,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1..7ea98a1 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck -check-prefix=GISEL %s
define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-LABEL: v_sdiv_v2i128_vv:
diff --git a/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll b/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
index 8dfce73..40ab750 100644
--- a/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 | FileCheck -check-prefixes=GCN,GFX9 %s
; A test case that originally failed in divergence calculation
; The implementation has to identify all formal args that can be a source of divergence
@@ -10,7 +10,7 @@
; GCN-LABEL: {{^}}_amdgpu_vs_main:
; GCN-NOT: v_readfirstlane
; PRE-GFX9: flat_load_dword
-; GFX9: global_load
+; GFX9: global_load
define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) local_unnamed_addr #0 {
.entry:
%tmp = add i32 %arg4, %arg8
diff --git a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
index ed92bf3..7cabb71 100644
--- a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 | FileCheck -check-prefixes=GCN,GFX9 %s
; Testing for failures in divergence calculations when a divergent intrinsic is lowered during instruction selection
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
index 3e198b6..a896b9e 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: @bfe_uniform
; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 827cb4a..8c3d20f 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -4,7 +4,7 @@
; checks are looking for the absence of specific metadata, which
; cannot be expressed reliably by the generated checks.
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ISA
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=ISA
; RUN: opt --amdgpu-annotate-uniform -S %s | FileCheck %s -check-prefix=UNIFORM
; RUN: opt --amdgpu-annotate-uniform --si-annotate-control-flow -S %s | FileCheck %s -check-prefix=CONTROLFLOW
diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index 402a2943..bf37ccf 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
; GCN-LABEL: {{^}}dpp64_ceil:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index 926c2a3..539485d 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
diff --git a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 91962c1..6945d3a 100644
--- a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN %s
; The memory operand was dropped from the buffer_load_dword_offset
; when replaced with the addr64 during operand legalization, resulting
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
index 9712c62..842b912 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: ds_read32_combine_stride_400:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
index 418023b..0497542 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s
; There is no dependence between the store and the two loads. So we can combine
diff --git a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index 26418b0..397f5ad 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare void @llvm.amdgcn.s.barrier() #1
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 9cf9d81..dcf5179 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire < %s | FileCheck -enable-var-scope --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
index 302b351..46ba8cb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --enable-var-scope --check-prefix=GCN %s
; Check that the vectorizer does not create slow misaligned loads
diff --git a/llvm/test/CodeGen/AMDGPU/ds_gws_align.ll b/llvm/test/CodeGen/AMDGPU/ds_gws_align.ll
index 220f82f..53bca0c 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_gws_align.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_gws_align.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -early-live-intervals -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -early-live-intervals -o - < %s | FileCheck --check-prefixes=GCN,GFX908 %s
; GCN-LABEL: {{^}}gws_init_odd_reg:
; GFX908-DAG: ds_gws_init v1 gds
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index d95f528..9f1b55e 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 9b85ad2..739aad3 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -strict-whitespace -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 5a8521b..37f56aa 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt,-enable-ds128 < %s | FileCheck --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+load-store-opt,-enable-ds128 < %s | FileCheck --check-prefix=CI %s
@lds = addrspace(3) global [512 x float] poison, align 4
@lds.v2 = addrspace(3) global [512 x <2 x float>] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
index cc68ff3..1c425d1 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] poison, align 4
@lds.f64 = addrspace(3) global [512 x double] poison, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 41e3d5f..91bd837 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
@lds = addrspace(3) global [512 x float] poison, align 4
@lds.f64 = addrspace(3) global [512 x double] poison, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
index b2f6f24..502d4bb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
index 7c4b471..04d5913 100644
--- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
+++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GCN
; This is slightly modified IR from a real case, trimmed to be concise.
define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpCenter) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 38d4998..d646460 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; ATTRIBUTOR_GCN-NEXT: call void @indirect()
+; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
; ATTRIBUTOR_GCN-NEXT: ret void
;
%fptr = alloca ptr, addrspace(5)
@@ -28,7 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
attributes #0 = { "amdgpu-no-dispatch-id" }
;.
-;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index 1b72a97..6cc0c03 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCNX3 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GCNX3 %s
; FIXME: Most of these cases don't trigger because of broken cost
; heuristics. Should not need -stress-early-ifcvt
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
index cc7460e..8acfdb0 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; XUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -check-prefix=GCN %s
+; XUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 < %s | FileCheck -check-prefix=GCN %s
; Note: breaking up large PHIs is disabled to prevent some testcases from becoming
; branchless.
diff --git a/llvm/test/CodeGen/AMDGPU/elf.ll b/llvm/test/CodeGen/AMDGPU/elf.ll
index f51d9fc..28a87b0 100644
--- a/llvm/test/CodeGen/AMDGPU/elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -mtriple=amdgcn -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
; Test that we don't try to produce a COFF file on Windows
-; RUN: llc < %s -mtriple=amdgcn-pc-mingw -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn-pc-mingw -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
; ELF: Format: elf64-amdgpu
; ELF: OS/ABI: SystemV (0x0)
diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll
index 884f530..4576c19 100644
--- a/llvm/test/CodeGen/AMDGPU/else.ll
+++ b/llvm/test/CodeGen/AMDGPU/else.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}else_no_execfix:
; CHECK: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/empty-function.ll b/llvm/test/CodeGen/AMDGPU/empty-function.ll
index dba5122..088effc 100644
--- a/llvm/test/CodeGen/AMDGPU/empty-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/empty-function.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; Make sure we don't assert on empty functions
diff --git a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
index 00c5e0a..f961282 100644
--- a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s
; This tests that the llvm.SI.end.cf intrinsic is not inserted into the
; loop block. This intrinsic will be lowered to s_or_b64 by the code
diff --git a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
index 7fbd6eb..f63f2bc 100644
--- a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: error: <unknown>:0:0: scalar registers (106) exceeds limit (104) in function 'use_too_many_sgprs_tahiti'
define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll b/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
index 45fea2e..72de1df 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}expand_atomicrmw_agent:
; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll b/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
index 2e9bfc8..1cff873 100644
--- a/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=GCN
; GCN-LABEL: and_zext:
; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/extload-align.ll b/llvm/test/CodeGen/AMDGPU/extload-align.ll
index 032b4fe..249038b 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-align.ll
@@ -1,4 +1,4 @@
-; RUN: llc -debug-only=machine-scheduler -mtriple=amdgcn-- -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=DEBUG %s
+; RUN: llc -debug-only=machine-scheduler -mtriple=amdgcn-- %s -o - 2>&1| FileCheck -check-prefix=DEBUG %s
; REQUIRES: asserts
; Verify that the extload generated from %eval has the default
diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll
index 3802dc5..71eaecd 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i8_sext_private:
; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/extload.ll b/llvm/test/CodeGen/AMDGPU/extload.ll
index bdeef35..54a6919 100644
--- a/llvm/test/CodeGen/AMDGPU/extload.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; FIXME: This never seems to actually become an extload
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 5e637ba..89bd5f1 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
; but with all of the 64-bit tests and the tests with loads dropped.
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index a07f1d8..555adec 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -o - %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
; SI-LABEL: vec_8xi16_extract_4xi16:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
index 4cd3959..3ca41b0 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck %s
; Test for ICE in SelectionDAG::computeKnownBits when visiting EXTRACT_SUBVECTOR
; with DemandedElts already as wide as the source vector.
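For reference, a minimal IR sketch (hypothetical, not part of this diff) of the kind of contiguous subvector extract that SelectionDAG lowers through EXTRACT_SUBVECTOR, the node the regression test above guards:
  define <2 x i32> @extract_lo_half(<4 x i32> %v) {
    ; A shufflevector taking a contiguous slice becomes EXTRACT_SUBVECTOR in the DAG.
    %sub = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
    ret <2 x i32> %sub
  }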
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index a8d9414..1c68773 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -o - %s | FileCheck -check-prefix=GCN %s
define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_2xi16:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
index cca0dd6..d1c74fe 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32:
; GCN: buffer_load_dword
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 35fe6eb..dcfac6f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: extract_vector_elt_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index e8efe0b..9201f60 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
; GCN: buffer_load_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 12b26cb..625ac12 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
;
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 6b6f6ff..eb0ed5e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; Tests how the replacement of i64 stores with v2i32 stores resulted in
; breaking other users of the bitcast if they already existed
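For reference, a minimal IR sketch (hypothetical values and names, not part of the diff) of the situation being tested, where rewriting the store must not clobber a pre-existing bitcast that has its own user:
  define i32 @store_i64_with_bitcast_user(i64 %x, ptr addrspace(1) %out) {
    store i64 %x, ptr addrspace(1) %out   ; may be rewritten as a <2 x i32> store
    %bc = bitcast i64 %x to <2 x i32>     ; pre-existing bitcast with another user
    %lo = extractelement <2 x i32> %bc, i32 0
    ret i32 %lo
  }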
diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index 2a847e0..55371f9 100644
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s -check-prefix=GCN
+; RUN: llc < %s -mtriple=amdgcn-- | FileCheck %s -check-prefix=GCN
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 7b6a363..27cf49a 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
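For reference, the two equivalent forms of that transform, sketched as IR (hypothetical functions, not part of the diff):
  declare half @llvm.fabs.f16(half)

  define half @fabs_intrinsic(half %a) {
    %fabs = call half @llvm.fabs.f16(half %a)
    ret half %fabs
  }

  define half @fabs_integer_form(i16 %a) {
    %and = and i16 %a, 32767          ; clear the sign bit (0x7FFF)
    %bc = bitcast i16 %and to half
    ret half %bc
  }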
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
index 5130ec3..5d45f67 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index c53c1be..13206ad 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
-; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
+; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
+; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
; FIXME: This should also fold when fma is actually fast if an FMA
; exists in the original program.
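For reference, a minimal sketch (hypothetical, not part of the diff) of the mul/add pair the FIXME wants folded into an FMA when the target's fma is fast:
  define float @mul_then_add(float %x, float %y, float %z) {
    %mul = fmul contract float %x, %y
    %add = fadd contract float %mul, %z   ; candidate for folding to llvm.fma.f32 / v_fma_f32
    ret float %add
  }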
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index fc3624c..e57f0b6 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16,-flat-for-global -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16,-flat-for-global -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16,-flat-for-global -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16,-flat-for-global -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-GISEL %s
define amdgpu_kernel void @fadd_f16(
; SI-LABEL: fadd_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.ll b/llvm/test/CodeGen/AMDGPU/fadd.ll
index e31f875..e363cc8 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; FUNC-LABEL: {{^}}fadd_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fadd64.ll b/llvm/test/CodeGen/AMDGPU/fadd64.ll
index 1d3a16e..27c4909 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}v_fadd_f64:
; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll
new file mode 100644
index 0000000..85e7038
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s
+
+define float @test_canonicalize_amdgcn_tanh_f32(float %a) {
+; GCN-LABEL: test_canonicalize_amdgcn_tanh_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_tanh_f32_e32 v0, v0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %tanh = call float @llvm.amdgcn.tanh.f32(float %a)
+ %canonicalized = call float @llvm.canonicalize.f32(float %tanh)
+ ret float %canonicalized
+}
+
+define bfloat @test_canonicalize_amdgcn_tanh_bf16(bfloat %a) {
+; GCN-LABEL: test_canonicalize_amdgcn_tanh_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_tanh_bf16_e32 v0, v0
+; GCN-NEXT: v_nop
+; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_max_num_f32_e32 v0, v0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat %a)
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %tanh)
+ ret bfloat %canonicalized
+}
+
+define half @test_canonicalize_amdgcn_tanh_f16(half %a) {
+; GCN-LABEL: test_canonicalize_amdgcn_tanh_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_tanh_f16_e32 v0, v0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %tanh = call half @llvm.amdgcn.tanh.f16(half %a)
+ %canonicalized = call half @llvm.canonicalize.f16(half %tanh)
+ ret half %canonicalized
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index ab476dd..ab51693 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 9ef4858..7524750 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=kaveri < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index bc54104..d32b528 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fceil.ll b/llvm/test/CodeGen/AMDGPU/fceil.ll
index 193ab95..1edb542 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.ceil.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index 367bbe7..bd1f98a 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s
declare double @llvm.ceil.f64(double) nounwind readnone
declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 1d83d33..167bcab 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_kernel void @fcmp_f16_lt(
; SI-LABEL: fcmp_f16_lt:
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp64.ll b/llvm/test/CodeGen/AMDGPU/fcmp64.ll
index ff1d82b..e7729649 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s
; CHECK-LABEL: {{^}}flt_f64:
; CHECK: v_cmp_nge_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/fconst64.ll b/llvm/test/CodeGen/AMDGPU/fconst64.ll
index ab5a389..337b545 100644
--- a/llvm/test/CodeGen/AMDGPU/fconst64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fconst64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
; CHECK: {{^}}fconst_f64:
; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
new file mode 100644
index 0000000..01ebe7d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
@@ -0,0 +1,298 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s
+
+; TODO: Support safe bf16 fdiv lowering. The unsupported case is kept
+; commented out below until it can be lowered safely:
+; define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) {
+;   %fdiv = fdiv bfloat %x, %y
+;   ret bfloat %fdiv
+; }
+
+define bfloat @v_rcp_bf16(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fdiv = fdiv bfloat 1.0, %x
+ ret bfloat %fdiv
+}
+
+define bfloat @v_rcp_bf16_abs(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l|
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0|
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fabs = call bfloat @llvm.fabs.bf16(bfloat %x)
+ %fdiv = fdiv bfloat 1.0, %fabs
+ ret bfloat %fdiv
+}
+
+define bfloat @v_rcp_bf16_afn(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fdiv = fdiv afn bfloat 1.0, %x
+ ret bfloat %fdiv
+}
+
+define bfloat @v_rcp_bf16_neg(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fdiv = fdiv bfloat -1.0, %x
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat 1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16_neg(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat -1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l
+; GFX1250-TRUE16-NEXT: v_nop
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat 1.0, %sqrt
+ %r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0
+ %r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1
+ ret <2 x bfloat> %r2
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat 1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv bfloat 1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv bfloat -1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
+; GFX1250-TRUE16-LABEL: v_rsq_v2bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_v2bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
+ %fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt
+ ret <2 x bfloat> %fdiv
+}
+
+define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
+; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
+ %fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt
+ ret <2 x bfloat> %fdiv
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index c437318..9ae9d19 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; Make sure fdiv is promoted to f32.
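For reference, a minimal sketch (hypothetical) of the division that gets promoted: on subtargets without native f16 division, the operands are extended to f32, divided, and truncated back to f16.
  define half @fdiv_f16(half %a, half %b) {
    %div = fdiv half %a, %b   ; lowered via f32 on targets lacking f16 division
    ret half %div
  }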
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
index d8c7e33..acb32d4 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
; GCN-LABEL: {{^}}fdiv_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index b61981b..f2d5ed1 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -1,11 +1,11 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=GCN,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
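For reference, a minimal IR sketch (hypothetical, not part of the diff) of the dot-product expression shape that can match v_dot2_f32_f16: both halves of each operand are extended, multiplied, and accumulated into a float.
  define float @fdot2_shape(<2 x half> %s1, <2 x half> %s2, float %z) {
    %x1 = extractelement <2 x half> %s1, i64 0
    %y1 = extractelement <2 x half> %s1, i64 1
    %x2 = extractelement <2 x half> %s2, i64 0
    %y2 = extractelement <2 x half> %s2, i64 1
    %fx1 = fpext half %x1 to float
    %fy1 = fpext half %y1 to float
    %fx2 = fpext half %x2 to float
    %fy2 = fpext half %y2 to float
    %mul.x = fmul fast float %fx1, %fx2
    %mul.y = fmul fast float %fy1, %fy2
    %add1 = fadd fast float %mul.y, %z
    %add2 = fadd fast float %mul.x, %add1
    ret float %add2
  }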
diff --git a/llvm/test/CodeGen/AMDGPU/fence-barrier.ll b/llvm/test/CodeGen/AMDGPU/fence-barrier.ll
index 9f2332c..a991735 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-barrier.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GCN %s
declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
index 78bcda7..607ed85 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
@lds = internal addrspace(3) global [576 x double] poison, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
index 09e96fe..66cab0b 100644
--- a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
declare double @llvm.fabs.f64(double %Val)
declare double @llvm.floor.f64(double) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.ll b/llvm/test/CodeGen/AMDGPU/ffloor.ll
index dda5c16..ce2d332 100644
--- a/llvm/test/CodeGen/AMDGPU/ffloor.ll
+++ b/llvm/test/CodeGen/AMDGPU/ffloor.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}floor_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
new file mode 100644
index 0000000..ea1ae04
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -0,0 +1,18 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s
+
+---
+name: test_overlap
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: test_overlap
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr3_vgpr4, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr1_vgpr2 {
+ ; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_pseudo_e32 1, $vgpr0_vgpr1, implicit $exec
+ ; CHECK-NEXT: $vgpr3_vgpr4 = V_LSHLREV_B64_pseudo_e32 1, $vgpr1_vgpr2, implicit $exec
+ ; CHECK-NEXT: }
+ $vgpr2_vgpr3 = V_LSHLREV_B64_pseudo_e32 1, $vgpr0_vgpr1, implicit $exec
+ $vgpr3_vgpr4 = V_LSHLREV_B64_pseudo_e32 1, $vgpr1_vgpr2, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
index 6ce3c68..2e998dd 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=prologepilog < %s | FileCheck -check-prefix=GCN %s
; It is a small loop test that iterates over the array member of the structure argument passed byval to the function.
; The loop code will keep the prologue and epilogue blocks apart.
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index c4063ae..76a2114 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefix=GCN %s
; The custom CSR spills inserted during frame lowering were earlier using SP as the frame base.
; The offsets allocated for the CS objects go wrong when any local stack object has a higher
diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
index 310f32c..c195642 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
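For reference, a minimal sketch (hypothetical function, not part of the diff) using the non-deprecated intrinsic:
  declare float @llvm.amdgcn.strict.wwm.f32(float)

  define amdgpu_ps float @use_strict_wwm(float %x) {
    %wwm = call float @llvm.amdgcn.strict.wwm.f32(float %x)   ; whole-wave-mode copy of %x
    ret float %wwm
  }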
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
index 997432d..4f8dade 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index f2f8c0a..370b43a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) {
; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
index 1732dd0..6bb7cdd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
define void @flat_inst_offset(ptr nocapture %p) {
; GFX9-LABEL: flat_inst_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index f4040f3..bd4ee03 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -256,17 +256,15 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_4
; GFX1250-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB10_5
@@ -276,16 +274,15 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB10_5
@@ -307,11 +304,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_4
; GFX1250-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
@@ -321,17 +316,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
@@ -350,22 +342,19 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_4
; GFX1250-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB11_5
@@ -375,7 +364,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private
@@ -384,7 +372,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB11_5
@@ -400,9 +387,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -410,11 +396,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_4
; GFX1250-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
@@ -424,17 +408,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
@@ -455,12 +436,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_4
; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi
@@ -472,10 +452,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off
@@ -495,7 +475,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_4
; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi
@@ -507,12 +486,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -529,17 +506,15 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_4
; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi
@@ -551,7 +526,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
@@ -569,16 +543,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_4
; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi
@@ -590,12 +562,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -676,17 +646,15 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_4
; GFX1250-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
@@ -696,18 +664,17 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
; GFX1250-SDAG-NEXT: .LBB18_5:
@@ -728,11 +695,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_4
; GFX1250-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
@@ -742,19 +707,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
; GFX1250-GISEL-NEXT: .LBB18_5:
@@ -772,22 +734,19 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_4
; GFX1250-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
@@ -797,7 +756,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private
@@ -805,10 +763,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
; GFX1250-SDAG-NEXT: .LBB19_5:
@@ -823,9 +780,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -833,11 +789,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_4
; GFX1250-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
@@ -847,19 +801,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
; GFX1250-GISEL-NEXT: .LBB19_5:
@@ -879,12 +830,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_4
; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi
@@ -896,15 +846,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -922,7 +872,6 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_4
; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi
@@ -934,16 +883,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -959,17 +906,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_4
; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
@@ -981,7 +926,6 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
@@ -989,7 +933,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1002,16 +946,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_4
; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
@@ -1023,16 +965,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1112,17 +1052,15 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_4
; GFX1250-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB26_5
@@ -1131,21 +1069,18 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB26_5
; GFX1250-SDAG-NEXT: .LBB26_5:
@@ -1166,11 +1101,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_4
; GFX1250-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
@@ -1179,22 +1112,17 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
; GFX1250-GISEL-NEXT: .LBB26_5:
@@ -1212,22 +1140,19 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_4
; GFX1250-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB27_5
@@ -1236,8 +1161,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private
@@ -1245,12 +1169,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB27_5
; GFX1250-SDAG-NEXT: .LBB27_5:
@@ -1265,9 +1186,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1275,11 +1195,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_4
; GFX1250-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
@@ -1288,22 +1206,17 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
; GFX1250-GISEL-NEXT: .LBB27_5:
@@ -1323,12 +1236,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_4
; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi
@@ -1338,19 +1250,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1368,7 +1278,6 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_4
; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi
@@ -1378,20 +1287,16 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1407,17 +1312,15 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_4
; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi
@@ -1427,9 +1330,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private
@@ -1437,9 +1339,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -1452,16 +1352,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_4
; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi
@@ -1471,20 +1369,16 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -1564,17 +1458,15 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_4
; GFX1250-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB34_5
@@ -1584,10 +1476,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1596,7 +1488,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB34_5
; GFX1250-SDAG-NEXT: .LBB34_5:
@@ -1617,11 +1508,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_4
; GFX1250-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
@@ -1631,12 +1520,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1644,7 +1531,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
; GFX1250-GISEL-NEXT: .LBB34_5:
@@ -1662,22 +1548,19 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_4
; GFX1250-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB35_5
@@ -1687,7 +1570,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private
@@ -1699,7 +1581,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB35_5
; GFX1250-SDAG-NEXT: .LBB35_5:
@@ -1714,9 +1595,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1724,11 +1604,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_4
; GFX1250-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
@@ -1738,12 +1616,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1751,7 +1627,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
; GFX1250-GISEL-NEXT: .LBB35_5:
@@ -1771,12 +1646,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_4
; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi
@@ -1788,10 +1662,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -1815,7 +1689,6 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_4
; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi
@@ -1827,12 +1700,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1853,17 +1724,15 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_4
; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi
@@ -1875,7 +1744,6 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
@@ -1897,16 +1765,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_4
; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi
@@ -1918,12 +1784,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2008,17 +1872,15 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_4
; GFX1250-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB42_5
@@ -2028,10 +1890,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2040,7 +1902,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB42_5
; GFX1250-SDAG-NEXT: .LBB42_5:
@@ -2061,11 +1922,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_4
; GFX1250-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
@@ -2075,12 +1934,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2088,7 +1945,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
; GFX1250-GISEL-NEXT: .LBB42_5:
@@ -2106,22 +1962,19 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_4
; GFX1250-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB43_5
@@ -2131,7 +1984,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private
@@ -2143,7 +1995,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB43_5
; GFX1250-SDAG-NEXT: .LBB43_5:
@@ -2158,9 +2009,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2168,11 +2018,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_4
; GFX1250-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
@@ -2182,12 +2030,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2195,7 +2041,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
; GFX1250-GISEL-NEXT: .LBB43_5:
@@ -2215,12 +2060,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_4
; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi
@@ -2232,10 +2076,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2259,7 +2103,6 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_4
; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi
@@ -2271,12 +2114,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2297,17 +2138,15 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_4
; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi
@@ -2319,7 +2158,6 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
@@ -2341,16 +2179,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_4
; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi
@@ -2362,12 +2198,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2452,17 +2286,15 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_4
; GFX1250-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB50_5
@@ -2472,10 +2304,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2484,7 +2316,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB50_5
; GFX1250-SDAG-NEXT: .LBB50_5:
@@ -2505,11 +2336,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_4
; GFX1250-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
@@ -2519,12 +2348,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2532,7 +2359,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
; GFX1250-GISEL-NEXT: .LBB50_5:
@@ -2550,22 +2376,19 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_4
; GFX1250-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB51_5
@@ -2575,7 +2398,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private
@@ -2587,7 +2409,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB51_5
; GFX1250-SDAG-NEXT: .LBB51_5:
@@ -2602,9 +2423,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2612,11 +2432,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_4
; GFX1250-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
@@ -2626,12 +2444,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2639,7 +2455,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
; GFX1250-GISEL-NEXT: .LBB51_5:
@@ -2659,12 +2474,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_4
; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi
@@ -2676,10 +2490,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
@@ -2703,7 +2517,6 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_4
; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi
@@ -2715,12 +2528,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2741,17 +2552,15 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_4
; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi
@@ -2763,7 +2572,6 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
@@ -2785,16 +2593,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_4
; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi
@@ -2806,12 +2612,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -2890,17 +2694,15 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
@@ -2910,21 +2712,19 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
; GFX1250-SDAG-NEXT: .LBB58_5:
@@ -2945,11 +2745,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
@@ -2959,22 +2757,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
; GFX1250-GISEL-NEXT: .LBB58_5:
@@ -2992,22 +2786,19 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
@@ -3017,7 +2808,6 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
@@ -3027,11 +2817,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
; GFX1250-SDAG-NEXT: .LBB59_5:
@@ -3046,9 +2834,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3056,11 +2843,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
@@ -3070,22 +2855,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
; GFX1250-GISEL-NEXT: .LBB59_5:
@@ -3105,12 +2886,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_4
; GFX1250-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi
@@ -3121,16 +2901,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3149,7 +2928,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_4
; GFX1250-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi
@@ -3160,17 +2938,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3187,17 +2962,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_4
; GFX1250-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi
@@ -3208,7 +2981,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private
@@ -3217,7 +2989,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3231,16 +3002,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_4
; GFX1250-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi
@@ -3251,17 +3020,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3336,17 +3102,15 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
@@ -3356,21 +3120,19 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
; GFX1250-SDAG-NEXT: .LBB66_5:
@@ -3391,11 +3153,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
@@ -3405,22 +3165,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
; GFX1250-GISEL-NEXT: .LBB66_5:
@@ -3438,22 +3194,19 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
@@ -3463,7 +3216,6 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
@@ -3473,11 +3225,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
; GFX1250-SDAG-NEXT: .LBB67_5:
@@ -3492,9 +3242,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3502,11 +3251,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
@@ -3516,22 +3263,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
; GFX1250-GISEL-NEXT: .LBB67_5:
@@ -3551,12 +3294,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_4
; GFX1250-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi
@@ -3567,16 +3309,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3595,7 +3336,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_4
; GFX1250-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi
@@ -3606,17 +3346,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3633,17 +3370,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_4
; GFX1250-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi
@@ -3654,7 +3389,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private
@@ -3663,7 +3397,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -3677,16 +3410,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_4
; GFX1250-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi
@@ -3697,17 +3428,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -3782,17 +3510,15 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
@@ -3802,21 +3528,19 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
; GFX1250-SDAG-NEXT: .LBB74_5:
@@ -3837,11 +3561,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
@@ -3851,22 +3573,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
; GFX1250-GISEL-NEXT: .LBB74_5:
@@ -3884,22 +3602,19 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
@@ -3909,7 +3624,6 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
@@ -3919,11 +3633,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
; GFX1250-SDAG-NEXT: .LBB75_5:
@@ -3938,9 +3650,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3948,11 +3659,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
@@ -3962,22 +3671,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
; GFX1250-GISEL-NEXT: .LBB75_5:
@@ -3997,12 +3702,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_4
; GFX1250-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi
@@ -4013,16 +3717,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4041,7 +3744,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_4
; GFX1250-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi
@@ -4052,17 +3754,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4079,17 +3778,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_4
; GFX1250-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi
@@ -4100,7 +3797,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private
@@ -4109,7 +3805,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4123,16 +3818,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_4
; GFX1250-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi
@@ -4143,17 +3836,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4228,17 +3918,15 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
@@ -4248,21 +3936,19 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
; GFX1250-SDAG-NEXT: .LBB82_5:
@@ -4283,11 +3969,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
@@ -4297,22 +3981,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
; GFX1250-GISEL-NEXT: .LBB82_5:
@@ -4330,22 +4010,19 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
@@ -4355,7 +4032,6 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
@@ -4365,11 +4041,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
; GFX1250-SDAG-NEXT: .LBB83_5:
@@ -4384,9 +4058,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4394,11 +4067,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
@@ -4408,22 +4079,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
; GFX1250-GISEL-NEXT: .LBB83_5:
@@ -4443,12 +4110,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_4
; GFX1250-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi
@@ -4459,16 +4125,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4487,7 +4152,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_4
; GFX1250-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi
@@ -4498,17 +4162,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4525,17 +4186,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_4
; GFX1250-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi
@@ -4546,7 +4205,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private
@@ -4555,7 +4213,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4569,16 +4226,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_4
; GFX1250-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi
@@ -4589,17 +4244,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -4695,17 +4347,15 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_4
; GFX1250-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB90_5
@@ -4717,20 +4367,18 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB90_5
; GFX1250-SDAG-NEXT: .LBB90_5:
@@ -4752,11 +4400,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_4
; GFX1250-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB90_5
@@ -4768,21 +4414,17 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB90_5
; GFX1250-GISEL-NEXT: .LBB90_5:
@@ -4802,22 +4444,19 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_4
; GFX1250-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB91_5
@@ -4829,7 +4468,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private
@@ -4838,11 +4476,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB91_5
; GFX1250-SDAG-NEXT: .LBB91_5:
@@ -4858,9 +4494,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4868,11 +4503,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_4
; GFX1250-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB91_5
@@ -4884,21 +4517,17 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB91_5
; GFX1250-GISEL-NEXT: .LBB91_5:
@@ -4920,12 +4549,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_4
; GFX1250-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi
@@ -4939,16 +4567,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4968,7 +4595,6 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_4
; GFX1250-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi
@@ -4982,17 +4608,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5010,17 +4633,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_4
; GFX1250-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi
@@ -5034,7 +4655,6 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private
@@ -5043,7 +4663,6 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5058,16 +4677,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_4
; GFX1250-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi
@@ -5081,17 +4698,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5164,17 +4778,15 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_4
; GFX1250-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB98_5
@@ -5183,23 +4795,21 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB98_5
; GFX1250-SDAG-NEXT: .LBB98_5:
@@ -5220,11 +4830,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_4
; GFX1250-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
@@ -5233,25 +4841,21 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
; GFX1250-GISEL-NEXT: .LBB98_5:
@@ -5269,22 +4873,19 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_4
; GFX1250-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB99_5
@@ -5293,7 +4894,6 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private
@@ -5302,14 +4902,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-SDAG-NEXT: s_branch .LBB99_5
; GFX1250-SDAG-NEXT: .LBB99_5:
@@ -5324,9 +4922,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5334,11 +4931,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_4
; GFX1250-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
@@ -5347,25 +4942,21 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
; GFX1250-GISEL-NEXT: .LBB99_5:
@@ -5385,12 +4976,11 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_4
; GFX1250-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi
@@ -5400,18 +4990,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5430,7 +5019,6 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_4
; GFX1250-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi
@@ -5440,19 +5028,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
@@ -5470,17 +5055,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_4
; GFX1250-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi
@@ -5490,7 +5073,6 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private
@@ -5498,10 +5080,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5515,16 +5096,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_4
; GFX1250-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi
@@ -5534,19 +5113,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off
@@ -5621,17 +5197,15 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_4
; GFX1250-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB106_5
@@ -5640,10 +5214,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5651,9 +5225,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5678,11 +5251,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_4
; GFX1250-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB106_5
@@ -5691,21 +5262,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5727,22 +5295,19 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_4
; GFX1250-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB107_5
@@ -5751,7 +5316,6 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private
@@ -5762,9 +5326,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5783,9 +5346,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5793,11 +5355,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_4
; GFX1250-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB107_5
@@ -5806,21 +5366,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off
@@ -5844,12 +5401,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_4
; GFX1250-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi
@@ -5859,20 +5415,18 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
@@ -5892,7 +5446,6 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_4
; GFX1250-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi
@@ -5902,21 +5455,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
@@ -5934,17 +5483,15 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80)
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_4
; GFX1250-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi
@@ -5954,7 +5501,6 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private
@@ -5964,10 +5510,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off
@@ -5982,16 +5526,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_4
; GFX1250-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi
@@ -6001,21 +5543,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1]
; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index f54fbba..2079543 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -95,12 +95,24 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
}
define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
-; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295
%load = load i8, ptr %gep0
%zext = zext i8 %load to i32
@@ -329,7 +341,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -343,9 +355,8 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -551,12 +562,21 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
; Both 64-bit base and 32-bit offset are scalar
define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%load = load i8, ptr %gep0
@@ -567,12 +587,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase,
; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:-24
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24
@@ -584,12 +613,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr
; Both components uniform, zext forced to LHS of addressing expression
define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
@@ -602,12 +640,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in
; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:128
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
@@ -625,7 +672,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: ; return to shader part epilog
@@ -655,7 +702,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: ; return to shader part epilog
@@ -686,33 +733,13 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
; Cannot push the shift into 32-bits, and cannot match.
define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
@@ -743,8 +770,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, pt
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
@@ -760,8 +786,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 scale_offset
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
@@ -774,33 +799,13 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
; Range is 1 beyond the limit where we can move the shift into 32-bits.
define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) {
-; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
@@ -2130,11 +2135,10 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
@@ -2188,11 +2192,10 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir
new file mode 100644
index 0000000..e5955ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=si-fold-operands -stop-after=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_fold_fi_scratch_load_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr
+ ; GCN: renamable $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0
+ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+ S_ENDPGM 0, implicit %1
+
+...
+
+# SS form of the SCRATCH_LOAD_DWORD does not support offset scaling
+
+---
+name: test_no_fold_fi_scratch_load_vgpr_scale_offset
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: test_no_fold_fi_scratch_load_vgpr_scale_offset
+ ; GCN: renamable $vgpr0 = V_MOV_B32_e32 $sgpr32, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 4, 2048, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0
+ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 2048, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+ S_ENDPGM 0, implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index 844e65d..47910f5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX12 %s
; vgpr offset
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index a98df5c..b0e6752 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -150,13 +150,11 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -321,15 +319,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -494,15 +491,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -664,17 +660,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX12-GISEL-LABEL: soff2_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -850,13 +844,11 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -1032,13 +1024,11 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -1200,17 +1190,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX12-GISEL-LABEL: soff4_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -1386,13 +1374,11 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -1565,13 +1551,11 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
@@ -1672,9 +1656,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index c79cf87..d7cf411 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
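A worked case (an illustration, not taken from the test) of why the rewrite needs no-infs-fp-math: the two forms disagree as soon as an infinity can reach the multiply.

  x = +inf, y = 0:
    x * (y + 1) = inf * 1       = inf
    x * y + x   = inf * 0 + inf = NaN + inf = NaN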
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 0d9c839..fe46ac1 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-SDAG-TRUE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-SDAG-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-GISEL-FAKE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12-GISEL-FAKE16
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.maxnum.f16(half, half)
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll
index e448825..3677e26 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMAC_F64 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx90a -mattr=-flat-for-global < %s | FileCheck -check-prefixes=FUNC,GCN,FMAC_F64 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=FUNC,GCN,FMAC_F64 %s
declare double @llvm.fma.f64(double, double, double) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index a10856e..c7fadb8 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -1,13 +1,13 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cedar -verify-machineinstrs < %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=juniper -verify-machineinstrs < %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=sumo -verify-machineinstrs < %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=barts -verify-machineinstrs < %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=caicos -verify-machineinstrs < %s
-; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=turks -verify-machineinstrs < %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cedar < %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=juniper < %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=sumo < %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=barts < %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=caicos < %s
+; RUN: not llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=turks < %s
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fmac.sdwa.ll b/llvm/test/CodeGen/AMDGPU/fmac.sdwa.ll
index 827e5da..a050a8da07 100644
--- a/llvm/test/CodeGen/AMDGPU/fmac.sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmac.sdwa.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
; GCN-LABEL: {{^}}addMul2D:
; GFX1010: v_fmac_f16
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
index 83a4944..c24b773 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
declare double @llvm.maxnum.f64(double, double) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 86ebf3f..4827f75 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-FAKE16 %s
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmax3_olt_0_f32:
@@ -157,6 +159,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_fmax3_olt_0_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s18, s10
+; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_mov_b32 s22, s10
+; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_mov_b32 s16, s4
+; GFX1250-NEXT: s_mov_b32 s17, s5
+; GFX1250-NEXT: s_mov_b32 s20, s6
+; GFX1250-NEXT: s_mov_b32 s21, s7
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -317,6 +349,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_fmax3_olt_1_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s18, s10
+; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_mov_b32 s22, s10
+; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_mov_b32 s16, s4
+; GFX1250-NEXT: s_mov_b32 s17, s5
+; GFX1250-NEXT: s_mov_b32 s20, s6
+; GFX1250-NEXT: s_mov_b32 s21, s7
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -544,6 +606,66 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -772,6 +894,66 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -850,6 +1032,15 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: no_fmax3_v2f16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max3_num_f16 v0, v2, v0, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index 67a9c12..ed48999 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
index 1da621c..eee2bd1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll
new file mode 100644
index 0000000..852c9cf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define <2 x half> @fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX1250-SDAG-LABEL: fmaximum3_v2f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v0, v2, v0, v1
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fmaximum3_v2f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
+ ret <2 x half> %res
+}
+
+define <2 x half> @fmaximum3_v2f16_vss(<2 x half> %a, <2 x half> inreg %b, <2 x half> inreg %c) {
+; GFX1250-SDAG-LABEL: fmaximum3_v2f16_vss:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v0, s1, v0, s0
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fmaximum3_v2f16_vss:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, s1, v0, v0
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
+ ret <2 x half> %res
+}
+
+define <3 x half> @fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX1250-SDAG-LABEL: fmaximum3_v3f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v0, v4, v0, v2
+; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fmaximum3_v3f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX1250-GISEL-NEXT: v_maximum_f16 v1, v1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0
+; GFX1250-GISEL-NEXT: v_maximum_f16 v1, v5, v1
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %max = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %res = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max)
+ ret <3 x half> %res
+}
+
+define <4 x half> @fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX1250-SDAG-LABEL: fmaximum3_v4f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v0, v4, v0, v2
+; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fmaximum3_v4f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0
+; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %max = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max)
+ ret <4 x half> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index cbb0767..9233f80 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s
-; RUN: llc -mtriple=amdgcn -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s
+; RUN: llc -mtriple=amdgcn -global-isel=1 < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index d554707..6dfefd8 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-FAKE16 %s
define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmin3_olt_0_f32:
@@ -157,6 +159,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_fmin3_olt_0_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s18, s10
+; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_mov_b32 s22, s10
+; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_mov_b32 s16, s4
+; GFX1250-NEXT: s_mov_b32 s17, s5
+; GFX1250-NEXT: s_mov_b32 s20, s6
+; GFX1250-NEXT: s_mov_b32 s21, s7
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -317,6 +349,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_fmin3_olt_1_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s18, s10
+; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_mov_b32 s22, s10
+; GFX1250-NEXT: s_mov_b32 s23, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_mov_b32 s16, s4
+; GFX1250-NEXT: s_mov_b32 s17, s5
+; GFX1250-NEXT: s_mov_b32 s20, s6
+; GFX1250-NEXT: s_mov_b32 s21, s7
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: v_min3_num_f32 v0, v2, v0, v1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -544,6 +606,66 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX12-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -772,6 +894,66 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX12-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
+; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
+; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
+; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
+; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
+; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1
+; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX1250-FAKE16-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -850,6 +1032,15 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: no_fmin3_v2f16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_min3_num_f16 v0, v2, v0, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
%min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -1023,6 +1214,40 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_fmin3_olt_0_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s18, s10
+; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_mov_b32 s16, s4
+; GFX1250-NEXT: s_mov_b32 s17, s5
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: s_mov_b32 s12, s6
+; GFX1250-NEXT: s_mov_b32 s13, s7
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
%c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -1199,6 +1424,40 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_fmin3_olt_1_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s18, s10
+; GFX1250-NEXT: s_mov_b32 s19, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_mov_b32 s16, s4
+; GFX1250-NEXT: s_mov_b32 s17, s5
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: s_mov_b32 s12, s6
+; GFX1250-NEXT: s_mov_b32 s13, s7
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
%c = load volatile double, ptr addrspace(1) %cptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index fd809c6..34cb0b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index 8e595a8..ec4dd85 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll
new file mode 100644
index 0000000..df9fb10
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define <2 x half> @fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX1250-SDAG-LABEL: fminimum3_v2f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v0, v2, v0, v1
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fminimum3_v2f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %min = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %res = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %min)
+ ret <2 x half> %res
+}
+
+define <2 x half> @fminimum3_v2f16_vss(<2 x half> %a, <2 x half> inreg %b, <2 x half> inreg %c) {
+; GFX1250-SDAG-LABEL: fminimum3_v2f16_vss:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v0, s1, v0, s0
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fminimum3_v2f16_vss:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, s1, v0, v0
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %min = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %res = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %min)
+ ret <2 x half> %res
+}
+
+define <3 x half> @fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX1250-SDAG-LABEL: fminimum3_v3f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v0, v4, v0, v2
+; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fminimum3_v3f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX1250-GISEL-NEXT: v_minimum_f16 v1, v1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0
+; GFX1250-GISEL-NEXT: v_minimum_f16 v1, v5, v1
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %min = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %res = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %min)
+ ret <3 x half> %res
+}
+
+define <4 x half> @fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX1250-SDAG-LABEL: fminimum3_v4f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v0, v4, v0, v2
+; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: fminimum3_v4f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0
+; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %min = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %min)
+ ret <4 x half> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index d4471c8..c0f3726 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-DENORM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-FLUSH %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-DENORM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-DENORM,GFX11-DENORM-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-DENORM,GFX11-DENORM-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=VI,VI-DENORM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=VI,VI-FLUSH %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX10,GFX10-DENORM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-DENORM,GFX11-DENORM-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-DENORM,GFX11-DENORM-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s
; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
; make add an instruction if the fadd has more than one use.
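For context, here is a minimal IR sketch of the case this comment describes (a hypothetical function, not part of the diff): when the fadd result has a second use, rewriting (fmul (fadd x, x), c) as (fmul x, 2*c) cannot delete the fadd, so the combine would add an instruction instead of removing one.

; Sketch only: %add has two uses, so the combine should not fire.
define float @multi_use_sketch(float %x, ptr addrspace(1) %p) {
  %add = fadd float %x, %x                 ; 2 * x, also stored below
  store float %add, ptr addrspace(1) %p    ; second use keeps the fadd alive
  %mul = fmul float %add, 4.0              ; folding would need fmul %x, 8.0
  ret float %mul
}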
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index c16fa2d4..f871993 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fmul_f16(
; SI-LABEL: fmul_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul64.ll b/llvm/test/CodeGen/AMDGPU/fmul64.ll
index 2543c51..bbf33c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FUNC-LABEL: {{^}}fmul_f64:
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 0a85623..51b6d17 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1,23 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=VI-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=VI-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=VI-DENORM %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-STRICT-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-STRICT-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-CONTRACT-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-CONTRACT-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GFX11-FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GFX11-FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GFX11-DENORM-STRICT-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GFX11-DENORM-STRICT-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-DENORM-CONTRACT-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-DENORM-CONTRACT-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
index 945973b..ceacdf5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,24 +1,24 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
-; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
+; XUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s
; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
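As a reminder of what these permutations exercise (an illustrative sketch, not part of the diff): llvm.fmuladd is a multiply-add that the backend may either split into separate mul/add instructions or contract into a single fused operation, depending on the denormal mode, the -fp-contract setting, and whether fmaf is fast on the subtarget.

declare float @llvm.fmuladd.f32(float, float, float)

; Computes a*b + c; whether this lowers to v_mad/v_fma or to separate
; v_mul + v_add is exactly what the RUN-line matrix above varies.
define float @fmuladd_sketch(float %a, float %b, float %c) {
  %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  ret float %r
}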
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
index 43f7cd9..c70325f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-STRICT %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-STRICT %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-CONTRACT %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-CONTRACT %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-STRICT %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-CONTRACT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=on < %s | FileCheck -check-prefixes=SI,SI-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=on < %s | FileCheck -check-prefixes=SI,SI-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=fast < %s | FileCheck -check-prefixes=SI,SI-CONTRACT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=fast < %s | FileCheck -check-prefixes=SI,SI-CONTRACT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on < %s | FileCheck -check-prefixes=VI,VI-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast < %s | FileCheck -check-prefixes=VI,VI-CONTRACT %s
define amdgpu_kernel void @fmuladd_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
; SI-LABEL: fmuladd_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
index 0c40fe0..4dafe2d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
-
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-STRICT,GFX9-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-STRICT,GFX9-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-CONTRACT,GFX9-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-CONTRACT,GFX9-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
+
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-STRICT,GFX9-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-STRICT,GFX9-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-CONTRACT,GFX9-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-CONTRACT,GFX9-DENORM %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index e9fd611..a025c36 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SICI,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SICI,CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
declare half @llvm.nearbyint.f16(half) #0
declare float @llvm.nearbyint.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines-gfx1200.ll
index f90b79c..7d1dfae 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines-gfx1200.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -start-before=amdgpu-unify-divergent-exit-nodes --verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -start-before=amdgpu-unify-divergent-exit-nodes --verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-GISEL %s
; --------------------------------------------------------------------------------
; fminimum tests
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.si.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.si.ll
index 0ad6106..64af8f6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.si.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.si.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=tahiti -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=tahiti -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; --------------------------------------------------------------------------------
; rcp_legacy tests
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index eca8c28..9d9a851 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=CIVI,CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=CIVI,VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
; CI-LABEL: fneg_fabs_fadd_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 98e0b27..cab27fc 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=kaveri -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tonga -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: Should be able to do scalar op
define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
index 111e585..8fae960 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fneg_f64:
; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
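The check above encodes the usual lowering (sketch below is illustrative, not from the diff): IEEE-754 negation only flips the sign bit, so an f64 fneg of a value in scalar registers reduces to XOR-ing the high 32-bit word with 0x80000000.

; Sketch only: fneg touches nothing but bit 63, which is why the
; pattern above matches a single s_xor_b32 on the high word.
define amdgpu_kernel void @fneg_f64_sketch(ptr addrspace(1) %out, double %in) {
  %neg = fneg double %in
  store double %neg, ptr addrspace(1) %out
  ret void
}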
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
index 1c6ab3c1..6ef89a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 - < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -disable-machine-sink=1 - < %s | FileCheck -check-prefix=GFX10 %s
define float @fold_abs_in_branch(float %arg1, float %arg2) {
; GFX10-LABEL: fold_abs_in_branch:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fmaak-bug.ll b/llvm/test/CodeGen/AMDGPU/fold-fmaak-bug.ll
index 4edf4c4..a5d9996 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fmaak-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fmaak-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix GFX10
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float)
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll b/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
index 8401e04..433d770 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}fold_mul_neg:
; GCN: load_dword [[V:v[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 63ba18a..f09c257 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
declare i16 @llvm.umax.i16(i16, i16)
declare i64 @llvm.umin.i64(i64, i64)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 200f74b..4b800e4 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index f41eead..db938d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1100
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX1100
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1100
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=G_GFX1100
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg)
declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index fa5e2c7..ca7e2e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1100
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX1100
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1100
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=G_GFX1100
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
index d483364..37d0e54 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
-; RUN: not --crash llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX11-ERR
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
+; RUN: not --crash llc < %s -mtriple=amdgcn -mcpu=gfx1100 2>&1 | FileCheck %s -check-prefix=GFX11-ERR
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
-; RUN: not llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=G_GFX11-ERR
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: not llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 2>&1 | FileCheck %s -check-prefix=G_GFX11-ERR
; GFX11-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.atomic.f
; G_GFX11-ERR: LLVM ERROR: cannot select: {{.*}} = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.f
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
index a0119a2..874aa54 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
-; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
+; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
declare float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data)
declare float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
index a55c3d8..d525058 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
-; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
+; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
declare float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
declare float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index c359b84..42451f9 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s
-; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CAYMAN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=CYPRESS %s
+; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CAYMAN %s
declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index 2520e6b..5849f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 520390c..fc3aaab 100644
--- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=CYPRESS %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index 0ee9a21..2d38924 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index f9e5e3a..b8363da 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index 5f76c54..12b60be 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare double @llvm.fabs.f64(double) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 162bf52..7ab8b30 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s --check-prefixes=SI
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefixes=VI
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s --check-prefixes=SI
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s --check-prefixes=VI
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG
declare float @llvm.fabs.f32(float) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index c3b4e6f..7df6e81 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare double @llvm.fabs.f64(double) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index f4a1301..5428ba8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s -check-prefixes=SI
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
declare float @llvm.fabs.f32(float) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fpext-free.ll b/llvm/test/CodeGen/AMDGPU/fpext-free.ll
index d234374..b88cb21 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext-free.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext-free.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-F32FLUSH,GFX11-F32FLUSH-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-F32FLUSH,GFX11-F32FLUSH-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-F32DENORM,GFX11-F32DENORM-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-F32DENORM,GFX11-F32DENORM-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32DENORM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-F32FLUSH,GFX11-F32FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-F32FLUSH,GFX11-F32FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-F32DENORM,GFX11-F32DENORM-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-F32DENORM,GFX11-F32DENORM-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32FLUSH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32DENORM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
; fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index fa358c9..d41e2c6 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fpext_f16_to_f32(
; SI-LABEL: fpext_f16_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.ll b/llvm/test/CodeGen/AMDGPU/fpext.ll
index 964f0c1..5b45d01 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fpext_f32_to_f64:
; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 97a94ed..f048dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fptosi_f16_to_i16(
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 72ddc32..96abb3a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @fptoui_f16_to_i16(
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 0a900f90..d0b41e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f32_to_f16(
; SI-SDAG-LABEL: fptrunc_f32_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 49c563e..2bd3659 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -1,19 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-GISEL,VI-UNSAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-GISEL,VI-UNSAFE-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
index 1fae997..f09c1c6 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
declare double @llvm.fabs.f64(double) #0
declare double @llvm.floor.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll
index bc6ec96..8ef0fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.ll
@@ -1,8 +1,8 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.floor.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 40cff44..15cda62 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 13884eb..2e88da1 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=true < %s | FileCheck -check-prefix=SPILL-TO-VGPR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-spill-sgpr-to-vgpr=false < %s | FileCheck -check-prefix=NO-SPILL-TO-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-spill-sgpr-to-vgpr=true < %s | FileCheck -check-prefix=SPILL-TO-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-spill-sgpr-to-vgpr=false < %s | FileCheck -check-prefix=NO-SPILL-TO-VGPR %s
; Check frame setup where SGPR spills to VGPRs are disabled or enabled.
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 6fb64a9..0df1a0f 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mattr=+mad-mac-f32-insts < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 43caa4c7..ed1ee45 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 4a79096..b35b553 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck %s -check-prefixes=GFX89,SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX89,VI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX89,GFX9
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 42f0985..8f3b9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GISEL %s
define double @v_sqrt_f64(double %x) {
; GISEL-LABEL: v_sqrt_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index a764681..b8b3399 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fsub_f16(
; SI-LABEL: fsub_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.ll b/llvm/test/CodeGen/AMDGPU/fsub.ll
index 9c00df9..743431c 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_fsub_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fsub64.ll b/llvm/test/CodeGen/AMDGPU/fsub64.ll
index dd2c874..29af861 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
declare double @llvm.fabs.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index f8ff8ef..95e28a3 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
define void @void_func_i1(i1 %arg0) #0 {
; CIGFX89-LABEL: void_func_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 658c45c..38003f6 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,CI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX8 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,CI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX8 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
define i1 @i1_func_void() #0 {
; GFX789-LABEL: i1_func_void:
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index fc3915f..0658997 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefix=GCN %s
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
index f787a40..ca75874 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx9-generic --amdhsa-code-object-version=6 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx9-generic --amdhsa-code-object-version=6 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s
@gds0 = internal addrspace(2) global [4 x i32] poison, align 4
@lds0 = internal addrspace(3) global [4 x i32] poison, align 128
diff --git a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
index 6f6ff96..d24355f 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GCN,FUNC %s
; FUNC-LABEL: {{^}}atomic_add_ret_gds:
; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
index a63b3be..df32959 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
define amdgpu_kernel void @use_gep_address_space(ptr addrspace(3) %array) nounwind {
; CHECK-LABEL: {{^}}use_gep_address_space:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 8ac187ea..9d137fb 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG -enable-var-scope %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG -enable-var-scope %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GISEL -enable-var-scope %s
declare void @extern_c_func()
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index facc91a..2fdc1a8 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-SCRATCH %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-SCRATCH %s
declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0
declare hidden amdgpu_gfx void @external_void_func_i1_signext(i1 signext) #0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 13fff02..124de7e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -enable-ipra=0 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -enable-ipra=0 < %s | FileCheck --check-prefix=GFX11 %s
declare hidden amdgpu_gfx void @external_void_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 6682198..5c183f5 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_gfx i1 @return_i1() #0 {
; GFX9-LABEL: return_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll b/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
index f416308..9dae6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
; GCN-LABEL: {{^}}test_add_lit:
; GFX10PLUS: v_add_co_u32{{(_e64)?}} v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
@@ -63,4 +63,4 @@ define amdgpu_kernel void @test_bfe_2lit_v(ptr addrspace(1) %p) {
}
declare i32 @llvm.amdgcn.workitem.id.x()
-declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32)
\ No newline at end of file
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 737985c..acec0e7 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,DAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_s_load_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
index f004c19..99690e4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
; GFX9-DAG: buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
diff --git a/llvm/test/CodeGen/AMDGPU/global-address.ll b/llvm/test/CodeGen/AMDGPU/global-address.ll
index 60f4f0c..bcded52 100644
--- a/llvm/test/CodeGen/AMDGPU/global-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-address.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-PAL-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-PAL-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-HSA %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-HSA %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-HSA %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-HSA %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-HSA %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-HSA %s
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 819b06e..c2ddce4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 39e9ccc..bd9fe39 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index c790187..866d3a1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-PAL %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DEFAULT %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-MESA %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DEFAULT %s
-; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=R600 %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefixes=GCN,GCN-PAL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GCN-DEFAULT %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GCN-MESA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GCN-DEFAULT %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=R600 %s
@private1 = private unnamed_addr addrspace(4) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
@private2 = private unnamed_addr addrspace(4) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
diff --git a/llvm/test/CodeGen/AMDGPU/global-directive.ll b/llvm/test/CodeGen/AMDGPU/global-directive.ll
index ef5c3da4..ced9a13 100644
--- a/llvm/test/CodeGen/AMDGPU/global-directive.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-directive.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; Make sure the GlobalDirective isn't merged with the function name
diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
index bc2def2..ca84288 100644
--- a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index 8459743..f2da966 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
; Function Attrs: mustprogress nounwind willreturn
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index e2d33df..6fe9e1d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; The first load produces address in a VGPR which is used in address calculation
; of the second load (one inside the loop). The value is uniform and the inner
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index 5d35adc..fd644a3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -304,78 +304,79 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112
-; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96
-; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
+; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
+; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off offset:32
-; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:16
-; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off
+; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
+; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[16:17], 0x70
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[24:25], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
; GCN-SDAG-NEXT: v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT: v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[42:43], 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
+; GCN-SDAG-NEXT: v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[16:17], v[6:9], off
+; GCN-SDAG-NEXT: global_store_b128 v[24:25], v[10:13], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[10:13], off
+; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[18:21], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_dual_mov_b32 v16, v20 :: v_dual_mov_b32 v17, v21
+; GCN-SDAG-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, v[6:7]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[30:33], off
+; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[14:17], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[22:25], off
+; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[26:29], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[26:29], off
+; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[30:33], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_wait_xcnt 0x3
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[2:3], 0, v[2:3]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[0:1], 0, v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-SDAG-NEXT: s_wait_xcnt 0x2
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[36:37], 0, v[36:37]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[34:35], v[34:35], 0, v[34:35]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, 0x64
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, v[20:21]
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, 0xc8
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off
; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:96
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:112
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:32
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:16
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16
; GCN-SDAG-NEXT: s_clause 0x3
; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32
; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4
; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8
; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12
; GCN-SDAG-NEXT: s_wait_xcnt 0xc
-; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v28 :: v_dual_mov_b32 v1, v29
+; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -403,11 +404,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16
; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32
; GCN-GISEL-NEXT: v_mov_b64_e32 v[52:53], 48
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8
; GCN-GISEL-NEXT: v_mov_b64_e32 v[40:41], 0x50
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
@@ -422,28 +423,28 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off
; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
; GCN-GISEL-NEXT: s_wait_xcnt 0x5
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
; GCN-GISEL-NEXT: s_wait_xcnt 0x4
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[14:15], v[14:15], 0, v[14:15]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[16:17], v[16:17], 0, v[16:17]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
; GCN-GISEL-NEXT: s_wait_xcnt 0x3
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, v[18:19]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, 0x64
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[20:21], 0x64, v[20:21]
; GCN-GISEL-NEXT: s_wait_xcnt 0x2
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25]
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[48:49], v[0:1], 0, v[0:1]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[50:51], v[2:3], 0, v[2:3]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
; GCN-GISEL-NEXT: s_wait_xcnt 0x1
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
; GCN-GISEL-NEXT: s_wait_xcnt 0x0
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, v[32:33]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-GISEL-NEXT: s_clause 0x1
; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off
; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off
@@ -482,17 +483,16 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-SDAG-LABEL: test_v7i16_load_store_kernel:
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12
; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8
; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0
-; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1]
-; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3]
+; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
+; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7
; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6
@@ -509,21 +509,20 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-GISEL-LABEL: test_v7i16_load_store_kernel:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GCN-GISEL-NEXT: s_wait_xcnt 0x0
; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0
; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2
; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4
-; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6
; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8
; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10
; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x1
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1]
-; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3]
+; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
+; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4
; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 0512b9b..2aa198f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=None -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=None -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=None -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=None -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: atomic_add_i32_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index f7882e6..a867c6c1a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; ---------------------------------------------------------------------
; atomicrmw xchg
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 55a2dd0..778fc2e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) {
; CI-LABEL: atomic_add_i64_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 59a99a6..a7f1644 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; ---------------------------------------------------------------------
; atomicrmw xchg
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index c8b24f7..6351bb3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32, -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare float @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 4fccfc0..a9ac008 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare float @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index bb119eb..6311143 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
index d590baa..69f9311 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true < %s | FileCheck %s
; uniform loads
; CHECK-LABEL: @uniform_load
diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
index 670666b..2a39b3d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true < %s | FileCheck %s
; CHECK-LABEL: %bb22
@@ -75,12 +75,12 @@ bb22: ; preds = %bb20, %bb11
}
; one more test to ensure that aliasing store after the load
-; is considered clobbering if load parent block is the same
+; is considered clobbering if load parent block is the same
; as a loop header block.
; CHECK-LABEL: %bb1
-; Load from %arg has alias store that is after the load
+; Load from %arg has alias store that is after the load
; but is considered clobbering because of the loop.
; CHECK: flat_load_dword
diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index f4c03fb..4d24c84 100644
--- a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=EG,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 117cf40..8e427a6 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; half args should be promoted to float for CI and lower.
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
new file mode 100644
index 0000000..8007597
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+
+---
+name: flat_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: flat_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: global_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
index 9a9fd36..8bd6c0f 100644
--- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
@lds0 = addrspace(3) global [512 x float] poison
@lds1 = addrspace(3) global [256 x float] poison
diff --git a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
index 830a40f..f4abe2d 100644
--- a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -disable-block-placement < %s | FileCheck %s
; Check that invariant compare is hoisted out of the loop.
; At the same time condition shall not be serialized into a VGPR and deserialized later
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index af7b57a..c24c3f8 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_default_ci:
; GCN: .amdhsa_dx10_clamp 1
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
index 380a8e9..74eb3a7 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
; COM: Adapted from agpr-register-count.ll
; COM: GFX900 and below should not have .agpr_count present in the metadata
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index ec6c80e..25bf022 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 681a603..fe462fb 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=SI %s
define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) {
; SI-LABEL: i1_copy_from_loop:
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index cd0a15e..8d780d3 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; SILowerI1Copies was not handling IMPLICIT_DEF
; SI-LABEL: {{^}}br_poison:
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
index 856601e..09e0572 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_dont_clobber_scc:
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 68994f5..8e5b89e 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}br_i1_phi:
diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index f9dcd92..fc4cdcd 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s| FileCheck -check-prefix=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s| FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
;;;==========================================================================;;;
;; 16-bit integer comparisons
diff --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll
index c2f00f8..fb477c0 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}test_i64_eq:
; VI: s_cmp_eq_u64
diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
index 28aa76a..b68d74b 100644
--- a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand < %s | FileCheck --check-prefix=OPT %s
define i32 @global_agent_monotonic_idempotent_or(ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index ecbf5df..835818f 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GFX9-LABEL: udiv32_invariant_denom:
diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
index b1bfd54..f9c679d 100644
--- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_f16_tfe_dmask0:
diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
index 5cb9721..3206e95 100644
--- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN %s
declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/image-schedule.ll b/llvm/test/CodeGen/AMDGPU/image-schedule.ll
index 09e819d..9c44b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/image-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-schedule.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefixes=GCN %s
; The first image store and the second image load use the same descriptor and
; the same coordinate. Check that they do not get swapped by the machine
diff --git a/llvm/test/CodeGen/AMDGPU/img-nouse-adjust.ll b/llvm/test/CodeGen/AMDGPU/img-nouse-adjust.ll
index 31be0ab..b2d9a88 100644
--- a/llvm/test/CodeGen/AMDGPU/img-nouse-adjust.ll
+++ b/llvm/test/CodeGen/AMDGPU/img-nouse-adjust.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel < %s | FileCheck %s --check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -enable-new-pm < %s | FileCheck %s --check-prefix=GCN
; We're really just checking for no crashes
@@ -18,6 +18,6 @@ define amdgpu_cs void @_amdgpu_cs_main(i32 %dummy) local_unnamed_addr #0 {
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index a328bbe..58cfd40 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; Use a 64-bit value with lo bits that can be represented as an inline constant
define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index 8ca8767..676773a 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
; FIXME: Merge into imm.ll
diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index 342d7b0..d1315cd 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -1,8 +1,8 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
; FIXME: Merge into imm.ll
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll b/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
index 495e8a2..5392bff 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel -o - %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel -enable-new-pm -o - %s | FileCheck %s
; CHECK-LABEL: vcopy_i1_undef
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
index 872a457..8835d0c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s
; indexing of vectors.
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 3964207..98658de 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -stop-after=regallocfast < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -stop-after=regallocfast < %s | FileCheck -check-prefixes=GCN %s
; Verify that we consider the xor at the end of the waterfall loop emitted for
; divergent indirect addressing as a terminator.
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index d7c4f6a..a208cfd 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel < %s | FileCheck -check-prefix=GISEL %s
@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
@gv.fptr1 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
index 40cb061..97a7925 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-ALLOCA16,SI %s
-; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-ALLOCA4,SI %s
-; RUN: llc -mtriple=amdgcn -mattr=+promote-alloca -disable-promote-alloca-to-vector -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-PROMOTE,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+promote-alloca -disable-promote-alloca-to-vector -verify-machineinstrs < %s | FileCheck --check-prefixes=CI-PROMOTE,SI %s
+; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 < %s | FileCheck --check-prefixes=SI-ALLOCA16,SI %s
+; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 < %s | FileCheck --check-prefixes=SI-ALLOCA4,SI %s
+; RUN: llc -mtriple=amdgcn -mattr=+promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI-PROMOTE,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca,+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI-PROMOTE,SI %s
declare void @llvm.amdgcn.s.barrier() #0
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index bea532b..3e2e43f 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index cf15466..c7767cb8 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -16,6 +16,14 @@
ret void
}
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2() #0 {
+ ret void
+ }
+
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
+ ret void
+ }
+
attributes #0 = { "amdgpu-wave-limiter"="true" "amdgpu-waves-per-eu"="8,8" }
...
@@ -311,3 +319,173 @@ body: |
$agpr0 = COPY %0
...
+
+# Non-mac variant, src2 is an immediate.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Non-mac variant, src2 is the same VGPR, but a different subregister.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_1024_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 8718401..b907c13 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -970,3 +970,93 @@ body: |
S_ENDPGM 0
...
+
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $vgpr11 = COPY renamable $vgpr10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_1024_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0.sub0_sub1_sub2_sub3:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub4_sub5_sub6_sub7:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub8_sub9_sub10_sub11:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub12_sub13_sub14_sub15:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
index dce4162..adb31f5 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
; ERR: warning: inline asm clobber list contains reserved registers: v42
; ERR: note: Reserved registers on the clobber list may not be preserved across the asm statement, and clobbering them may lead to undefined behaviour.
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 74cdf15..54e7d0e 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=CHECK %s
; CHECK-LABEL: {{^}}inline_asm:
; CHECK: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/inline-calls.ll b/llvm/test/CodeGen/AMDGPU/inline-calls.ll
index e1cdfa8..de65b2e 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-calls.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple amdgcn-unknown-linux-gnu -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple amdgcn-unknown-linux-gnu -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple r600-unknown-linux-gnu -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s --check-prefix=R600
+; RUN: llc -mtriple amdgcn-unknown-linux-gnu -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -mtriple amdgcn-unknown-linux-gnu -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -mtriple r600-unknown-linux-gnu -mcpu=redwood < %s | FileCheck %s --check-prefix=R600
; ALL-NOT: {{^}}func:
define internal i32 @func(i32 %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 3aa6f3a..15e570b 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -1,8 +1,8 @@
-; RUN: not llc < %s -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: not llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=VI %s
+; RUN: not llc < %s -mtriple=amdgcn -mcpu=bonaire | FileCheck --check-prefix=GCN %s
+; RUN: not llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck --check-prefix=GCN --check-prefix=VI %s
-; RUN: not llc < %s -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs 2>&1 | FileCheck --check-prefix=NOGCN --check-prefix=NOSI %s
-; RUN: not llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs 2>&1 | FileCheck --check-prefix=NOGCN %s
+; RUN: not llc < %s -mtriple=amdgcn -mcpu=bonaire 2>&1 | FileCheck --check-prefix=NOGCN --check-prefix=NOSI %s
+; RUN: not llc < %s -mtriple=amdgcn -mcpu=tonga 2>&1 | FileCheck --check-prefix=NOGCN %s
; GCN-LABEL: {{^}}inline_reg_constraints:
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll
index 5bd116d..2aadb03 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}s_input_output_i16:
; GCN: s_mov_b32 s[[REG:[0-9]+]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
index 93b2a25..9f7f228 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
@@ -1,6 +1,6 @@
-; RUN: not llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=SICI %s
-; RUN: not llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=SICI %s
+; RUN: not llc -mtriple=amdgcn -mcpu=bonaire < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=SICI %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=SICI %s
; GCN: error: couldn't allocate output register for constraint 's'
; GCN: error: couldn't allocate input reg for constraint 's'
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
index 807a7d2..007c3f6 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}inline_asm_input_v2i16:
; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll
index 24bd8b4..1a2fa1d 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: not llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=INVALID %s
-; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=INVALID %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=bonaire < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=INVALID %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=INVALID %s
; GCN-LABEL: {{^}}s_input_output_v8f16
; GCN: s_mov_b32 s[0:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 9389f16..eb5c5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX11
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
index 0623110..d6e75d0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple amdgcn-amd-- -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple amdgcn-amd-- -mcpu=bonaire < %s | FileCheck -check-prefix=GCN %s
; Before the fix that this test was committed with, this code would leave
; an unused stack slot, causing ScratchSize to be non-zero.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index 1f51838..fb075221 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -stop-after=si-insert-waitcnts -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -stop-after=si-insert-waitcnts < %s | FileCheck %s
declare fastcc void @bar()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
index ed2d27c..e00ff00 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s
; Test that INSERT_SUBREG instructions don't have non-register operands after
; instruction selection.
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 47a371d8..1ac75d3 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: s_insertelement_v2bf16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index c947d69..2585167 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
index 80ed831..bbd9f3a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
index c63fe3d..45dbb88 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 6ad2ed3..6815050 100644
--- a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=GCN %s
; GatherAllAliases gives up on trying to analyze cases where the
; pointer may have been loaded from an aliased store, so make sure
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
index ebd1540..b8f7d18 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra=1 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s
; This test is to make sure the return address registers, if clobbered in the
; function or the function has calls, are save/restored when IPRA is enabled/disabled.
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 464cd82..1e3678d 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
; Kernels are not called, so there is no call preserved mask.
; GCN-LABEL: {{^}}kernel:
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
new file mode 100644
index 0000000..8fc5afb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator < %s | FileCheck %s
+
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+ ; CHECK-LABEL: name: basic_test
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0
+ ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+ ; CHECK-LABEL: name: unused_active
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+ ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ ret i32 14
+}
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+ ; CHECK-LABEL: name: multiple_blocks
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1)
+ ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2
+ ; CHECK-NEXT: G_BR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.end:
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2
+ ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f
+ ret i32 %e
+}
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+ ; CHECK-LABEL: name: ret_64
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64)
+ ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 0e5ce9d..b15ddc9 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index 49243fb..57b865d 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index befe0d4..a873c01 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
; Test formal argument lowering as well as calls to amdgpu_gfx functions.
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-dyn-vgpr-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-dyn-vgpr-w32.ll
index 4e040748..9fe26ec 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-dyn-vgpr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-dyn-vgpr-w32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX12 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX12 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 3261e4c..ab99defc 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
; We only care about which physical registers the parameters are copied from;
; the function bodies are just some arbitrary uses.
diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
new file mode 100644
index 0000000..3450d63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s
+
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+ ; DAGISEL-LABEL: name: basic_test
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: basic_test
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+ ; DAGISEL-LABEL: name: unused_active
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: unused_active
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+ ; GISEL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ret i32 14
+}
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+ ; DAGISEL-LABEL: name: multiple_blocks
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
+ ; DAGISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; DAGISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; DAGISEL-NEXT: S_BRANCH %bb.1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: bb.1.if.then:
+ ; DAGISEL-NEXT: successors: %bb.2(0x80000000)
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: bb.2.if.end:
+ ; DAGISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1
+ ; DAGISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]]
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: multiple_blocks
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GISEL-NEXT: S_BRANCH %bb.2
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: bb.2.if.then:
+ ; GISEL-NEXT: successors: %bb.3(0x80000000)
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: bb.3.if.end:
+ ; GISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2
+ ; GISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f
+ ret i32 %e
+}
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+ ; DAGISEL-LABEL: name: ret_64
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
+ ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]]
+ ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GISEL-LABEL: name: ret_64
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; GISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 2053ae9..0d3f342 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
; Check for verifier error due to trying to save and restore SCC
 ; around a waterfall loop when it was never defined. We have to get
diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
index 7caa563..96ca13f 100644
--- a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck %s
; Check for verifier error after tail duplication. An implicit_def of
 ; a subregister is needed to maintain liveness after assignment.
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index 6f61179..039ae1b 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn < %s | FileCheck %s
; Test that the alignment of kernel arguments does not impact the
; alignment of the stack
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5..bad2e60 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
-; RUN: llc < %s -mtriple=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
+; RUN: llc < %s -mtriple=amdgcn | FileCheck -check-prefixes=SI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck -check-prefixes=EGCM,EG %s
+; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck -check-prefixes=EGCM,CM %s
define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
; SI-LABEL: i8_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index f1fc1a2..9601162 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HSA-VI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HSA-VI,FUNC %s
; Repeat of some problematic tests in kernel-args.ll, with the IR
; argument lowering pass disabled. Struct padding needs to be
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 0681263..e8edf39 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 %s -o - | FileCheck %s
; The forced spill to preserve the scratch VGPR requires the voffset to hold the large offset
; value in the MUBUF instruction being emitted before s_cbranch_scc1 as it clobbers the SCC.
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
index 0a70734..684e3257 100644
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope %s
; Although it's modeled without any control flow in order to get better code
; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
diff --git a/llvm/test/CodeGen/AMDGPU/known-never-nan.ll b/llvm/test/CodeGen/AMDGPU/known-never-nan.ll
index 34aecd7..dc19c48 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-nan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-nan.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck %s
define half @known_nnan_extract_vector_elt(float %a, float %b, i32 %idx, half %c) {
; CHECK-LABEL: known_nnan_extract_vector_elt:
diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
index 64948c3..5691fc8 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Mostly overlaps with fmed3.ll to stress specific cases of
; isKnownNeverSNaN.
diff --git a/llvm/test/CodeGen/AMDGPU/lds-bounds.ll b/llvm/test/CodeGen/AMDGPU/lds-bounds.ll
index c7307cc..e732f22 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-bounds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-bounds.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOSI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,NOSI %s
@compute_lds = external addrspace(3) global [512 x i32], align 16
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 04abb75..48bf7fb 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck %s
; LDS is allocated per-kernel. Module scope variables are gathered into a struct which is
; allocated at address zero, if used by the kernel. Kernel scope variables are gathered into
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index e64ec99..c776b19 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -15,10 +15,10 @@
; we emit a trap. The s_endpgm needs to be emitted in a terminator
; position.
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s 2> %t | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s 2> %t | FileCheck -check-prefixes=CHECK,SDAG %s
; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s 2> %t | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s 2> %t | FileCheck -check-prefixes=CHECK,GISEL %s
; RUN: FileCheck -check-prefix=ERR %s < %t
diff --git a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
index 273a0bd..a0c6ec3 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; Make sure that m0 is not reinitialized in the loop.
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 7e7de64..69a871f 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=SPLIT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefix=SPLIT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefix=SPLIT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=SPLIT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefix=SPLIT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefix=SPLIT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX11 %s
define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
; SPLIT-LABEL: test_local_misaligned_v2:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
index db4e3e8..190a9a3 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
; The test is for a bug in R600EmitClauseMarkers.cpp where this pass
; was searching for a use of the OQAP register in order to determine
diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
index 771590f..3bbc060 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s
;
; This test checks that the LDS input queue is empty at the end of
; the ALU clause.
diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
index 455bb6b..3c55dcb 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -filetype=obj < %s | llvm-readobj -r --syms - | FileCheck -check-prefixes=ELF %s
@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
index 6ebfc9a..878d204 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=MESA %s
; gfx950 supports up to 160 KB of configurable LDS memory.
; This test checks the maximum LDS size and a size above the old limit of 128 KiB that can be allocated.
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
index 22cad8a..977b469 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=PAL %s
; GFX950 supports up to 160 KB of configurable LDS memory.
; This test checks the min and max size of LDS that can be allocated.
@@ -23,4 +23,4 @@ define amdgpu_gfx void @test_lds_array_i32() {
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
-} \ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index a756a0b..e9448bc 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll
index d76b6b2..1280531 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll
index ba32203..338b0ea 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll
index 0d110de..873c701 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll
index c443e6a..7e020dd 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(<4 x i32> %rsrc, half %val, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll
index 8f7ada6..f999515 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32(<4 x i32> %rsrc, float %val, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll
index 7707706..eb95368 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll
index 31225a3..3012767 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll
index 750284a..07b63a8 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll
index fedf751..c9c24e2 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll
index 67a2d97..85d4ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(ptr addrspace(8) %rsrc, half %val, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll
index d70a4b6..89dbb03 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32(ptr addrspace(8) %rsrc, float %val, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll
index d53fd61..c44ebaf 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, float %val, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll
index dd72f4e..1d1d4a4 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll
index e215afa..37902cd 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll
index 14466b8..688aaaf 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll
index 8ebd91945..eb5416e 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll
index 4ea8685..61c260e 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll
index c7c60a1..8261461 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll
index f2e0c4a..84f4258 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
; GFX908-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll
index 3e05d58..63f0e43 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
; Natural mapping
define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll b/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
index 5994888..5cdb04d 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-new-pm -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
; Type legalization for illegal FP type results was dropping invariant
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll
index b2f2c31..893f6b1 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefix=GFX908 %s
;; Older intrinsics that take <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
index 761e3ae..f607385 100644
--- a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-s-branch-bits=6 -amdgpu-long-branch-factor=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-s-branch-bits=6 -amdgpu-long-branch-factor=0 < %s | FileCheck -check-prefix=GCN %s
; Restrict maximum branch to between +31 and -32 dwords
diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll
index df4ff2c..768c972 100644
--- a/llvm/test/CodeGen/AMDGPU/literal64.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal64.ll
@@ -12,21 +12,11 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) {
}
define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) {
-; GCN-SDAG-LABEL: v_add_u64:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xf12345678)
-; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-SDAG-NEXT: s_endpgm
-;
-; GCN-GISEL-LABEL: v_add_u64:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xf12345678)
-; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
-; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-GISEL-NEXT: s_endpgm
+; GCN-LABEL: v_add_u64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1]
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
%result = add i64 %a, 64729929336
store i64 %result, ptr addrspace(1) %out, align 8
ret void
@@ -42,21 +32,11 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) {
}
define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) {
-; GCN-SDAG-LABEL: v_add_neg_u64:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xfffffff0edcba988)
-; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-SDAG-NEXT: s_endpgm
-;
-; GCN-GISEL-LABEL: v_add_neg_u64:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xfffffff0edcba988)
-; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
-; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GCN-GISEL-NEXT: s_endpgm
+; GCN-LABEL: v_add_neg_u64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xfffffff0edcba988), v[0:1]
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
%result = sub i64 %a, 64729929336
store i64 %result, ptr addrspace(1) %out, align 8
ret void
@@ -74,9 +54,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) {
define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) {
; GCN-LABEL: v_sub_u64:
; GCN: ; %bb.0:
-; GCN-NEXT: v_sub_co_u32 v0, vcc_lo, 0x12345678, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_sub_co_ci_u32_e64 v1, null, 15, v1, vcc_lo
+; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1]
; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
; GCN-NEXT: s_endpgm
%result = sub i64 64729929336, %a
@@ -94,15 +72,15 @@ define void @v_mov_b64_double(ptr addrspace(1) %ptr) {
; GCN-NEXT: .LBB6_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_f64_e32 v[2:3], lit64(0x4063233333333333), v[4:5]
; GCN-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: s_wait_alu 0xfffe
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GCN-NEXT: s_cbranch_execnz .LBB6_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -143,9 +121,7 @@ define i1 @class_f64() noinline optnone {
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_mov_b32 s2, 1
; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0x4063233333333333)
-; GCN-SDAG-NEXT: s_wait_alu 0xfffe
; GCN-SDAG-NEXT: v_cmp_class_f64_e64 s0, s[0:1], s2
-; GCN-SDAG-NEXT: s_wait_alu 0xf1ff
; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -155,13 +131,11 @@ define i1 @class_f64() noinline optnone {
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_mov_b32 s2, 1
; GCN-GISEL-NEXT: s_mov_b64 s[0:1], lit64(0x4063233333333333)
-; GCN-GISEL-NEXT: s_wait_alu 0xfffe
; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GCN-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GCN-GISEL-NEXT: v_cmp_class_f64_e64 s0, v[0:1], v2
; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 1
; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GCN-GISEL-NEXT: s_wait_alu 0xf1ff
; GCN-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%result = call i1 @llvm.amdgcn.class.f64(double 153.1, i32 1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
index b77b2f7..3dd9252 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
declare i32 @llvm.amdgcn.alignbyte(i32, i32, i32) #0
@@ -19,6 +21,30 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
+; GFX9-LABEL: v_alignbyte_b32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_alignbyte_b32 v1, s0, v1, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_alignbyte_b32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_alignbyte_b32 v0, s0, s1, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
; GFX11-TRUE16-LABEL: v_alignbyte_b32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
@@ -73,6 +99,41 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
+; GFX9-LABEL: v_alignbyte_b32_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x3c
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_alignbyte_b32 v1, v1, v2, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_alignbyte_b32_2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x3c
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_alignbyte_b32 v0, v1, v0, s2
+; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
; GFX11-TRUE16-LABEL: v_alignbyte_b32_2:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
index 6fbd5ff..243cd59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
; GFX12-LABEL: raw_buffer_atomic_cond_sub_return:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index a0db4ea..37c57ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 36b9dda..2f4ecb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
index 2dade84..ea8513f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-GISEL %s
declare i32 @llvm.amdgcn.bitop3.i32(i32, i32, i32, i32)
declare i16 @llvm.amdgcn.bitop3.i16(i16, i16, i16, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
index 8ae571df..631fdc7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
declare i64 @llvm.amdgcn.s.bitreplicate(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
index de484e3d..9ef082d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN
-;RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN
+;RUN: llc < %s -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN
define float @raw_buffer_load(<4 x i32> inreg) {
; GCN-LABEL: raw_buffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
index 659842a..a9ff032 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,SI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,GCNX3
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 | FileCheck %s -check-prefixes=CHECK,SI
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx700 | FileCheck %s -check-prefixes=CHECK,GCNX3
;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3:
;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
index 7723b56..ef29bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=CHECK
;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index 89dbe9b..92bdfe1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
declare half @llvm.fabs.f16(half %a)
declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index ae88ead..dedfda8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI %s
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
index a36f83f..84c0809 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.cos.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
index 39952d4..f580a7c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.cos.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
index c1e808c..6a5b2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.cubeid(float, float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
index 754f31c..37ebae7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.cubema(float, float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
index 328665f..1b28ffc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.cubesc(float, float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
index 26af411..6ff90e8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.cubetc(float, float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
index 25889de..9565314 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
@@ -9,6 +9,172 @@ declare half @llvm.amdgcn.cvt.f16.fp8(i32, i32)
declare <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16)
declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16)
+define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte0:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte0:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte0:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte0:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 0)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte1:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte1:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte1:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte1:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 1)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte2:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte2:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte2:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte2:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 2)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.h, v0 byte_sel:3
+; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3)
+ %ins.0 = insertelement <2 x half> undef, half 0.0, i32 0
+ %ins.1 = insertelement <2 x half> %ins.0, half %cvt, i32 1
+ %ret = bitcast <2 x half> %ins.1 to float
+ ret float %ret
+}
+
define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0:
; GFX1250-SDAG-REAL16: ; %bb.0:
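
The new bf8 checks added above all follow one pattern: the second i32 operand of @llvm.amdgcn.cvt.f16.bf8 selects which byte of the source dword is decoded, and it lowers to the byte_sel modifier on v_cvt_f16_bf8 (byte 0 takes the plain _e32 encoding, bytes 1-3 the _e64 form, and the _hi variant writes the result into the high half of the destination). A minimal standalone sketch of that pattern follows, assuming an llc build with gfx1250 support; the function name and the RUN invocation here are illustrative, not part of the patch:

; Sketch (assumed invocation): llc -mtriple=amdgcn -mcpu=gfx1250 < sketch.ll
; Decodes byte 2 of %a as bf8; per the GFX1250 checks above this is expected
; to select "v_cvt_f16_bf8_e64 ... byte_sel:2" followed by a f16->f32 convert.
declare half @llvm.amdgcn.cvt.f16.bf8(i32, i32)

define amdgpu_ps float @bf8_byte2_sketch(i32 %a) {
  %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 2)
  %ret = fpext half %cvt to float
  ret float %ret
}
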
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index aaaa751..856290a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 09b1ea7..b84fb52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
index ad547a3..3190515 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
index 82ac2bd..b9bfb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
index 6cdfcb8..f8eae31 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
index ebd40c2..2d1bc79 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 3e31c1b..42e73d1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index 4b113d8..291a4e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index 4e5b853..7067496 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-GISEL %s
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll
index d3851b1..fec30ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %old, bfloat %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %old, half %src, i32 %seed, float %scale, i32 %dst_sel)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
index 7433f66..ea887a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 %dst_sel)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
index 18b20e1..854708a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-GISEL %s
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 4fe6eed..b24f026 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index f1d3d56..fb29a57 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index 41eb4d2..4a71fce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone
declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 76cff96..a9a6431 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; FIXME: Enable for VI.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 9b9d864..c2393d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
index 8ea10f4..796f6b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
index 9aedaae..e0416ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
index 2776e24..8224fe4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_append_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
index ea85055..495a5a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.fi.b32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
declare i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 90e18a8..5828af5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
index 644ecf2..02cb7fb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32, i32, <4 x i32>, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
index 5795af7..b54a212 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_consume_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
index dcbfef0..4719ab9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
; MIR-LABEL: name: gws_barrier_offset0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index 1e03151..c5f6e2b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -1,21 +1,21 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos.
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
; Minimum offset
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
index 0949a60..9df09ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
@@ -1,15 +1,15 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
; Minimum offset
; GCN-LABEL: {{^}}gws_init_offset0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll
index da64f73..a201aa8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll
@@ -1,15 +1,15 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
; GCN-LABEL: {{^}}gws_sema_br_offset0:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll
index 180ea84..04bca85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
; GCN-LABEL: {{^}}gws_sema_p_offset0:
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll
index 16dce87..ccee4b1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll
@@ -1,16 +1,16 @@
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s 2>&1 | FileCheck -check-prefix=GFX6ERR-SDAG %s
; RUN: not llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s 2>&1 | FileCheck -check-prefix=GFX6ERR-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
; GFX6ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.gws.sema.release.all
; GFX6ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.release.all), %{{[0-9]+}}:sgpr(s32) :: (store (s32) into custom "GWSResource") (in function: gws_sema_release_all_offset0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll
index 215c394..1ebd61c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll
@@ -1,15 +1,15 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,NOLOOP %s
; GCN-LABEL: {{^}}gws_sema_v_offset0:
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll
index c35bb9f..0ae5a86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll
@@ -1,7 +1,7 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}ds_ordered_add:
; GCN-DAG: v_{{(dual_)?}}mov_b32{{(_e32)?}} v[[INCR:[0-9]+]], 31
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
index 30a7235..bbdf60c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
; GFX12-ERR: LLVM ERROR: Cannot select: {{.*}} = DS_ORDERED_COUNT
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
index bdec2c8..0490b91 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
@@ -1,9 +1,9 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
; FUNC-LABEL: {{^}}ds_ordered_add:
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
index 79288d7..6bff143 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -1,9 +1,9 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
; FUNC-LABEL: {{^}}ds_ordered_swap:
; GCN: s_mov_b32 m0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
index 6581e25..a16b62e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index eb5bded..f504f2c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950-GISEL %s
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
index bb1c460..d5ea159 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index 038ba91..90ba893 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
index 34b7a23..f10a717 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
-; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s 2>&1 | FileCheck -check-prefix=ERR %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
index 3dbda35..2ec907e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -strict-whitespace -check-prefix=ERR %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx1100 < %s 2>&1 | FileCheck -strict-whitespace -check-prefix=ERR %s
; ERR: error: <unknown>:0:0: in function test_export_compr_zeroes_v2f16 void (): intrinsic not supported on subtarget
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index c506e08..f921ad3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX8,PREGFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX8,PREGFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
index 1ad083a..a08dca8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=NOPRIM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=NOPRIM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
index 18923d3..af73475 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12
declare void @llvm.amdgcn.exp.row.i32(i32, i32, i32, i32, i32, i32, i1, i32)
declare void @llvm.amdgcn.exp.row.f32(i32, i32, float, float, float, float, i1, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index c5becb1..87a9ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" < %s | FileCheck -check-prefixes=SDAG-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" < %s | FileCheck -check-prefixes=SDAG-GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" < %s | FileCheck -check-prefixes=GISEL-GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" < %s | FileCheck -check-prefixes=GISEL-GFX10 %s
declare i32 @llvm.amdgcn.fcmp.f32(float, float, i32) #0
declare i32 @llvm.amdgcn.fcmp.f64(double, double, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index ec100a9..9e48246 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI-GISEL %s
declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32) #0
declare i64 @llvm.amdgcn.fcmp.f64(double, double, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
index 212c286..2c21b57 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
declare float @llvm.amdgcn.fdiv.fast(float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index addb395..4419b8c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
; FIXME: GlobalIsel doesn't support BF16 for now.
-; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
-; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
+; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
+; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bfloat %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 19e0348..0194d25 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 159592c..dda2e15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX950
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX950-ISEL
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 4d31e30..98cb096 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX12
declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
new file mode 100644
index 0000000..89555d3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)
+
+define amdgpu_ps void @flat_prefetch(ptr %ptr) {
+; GCN-LABEL: flat_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
+; GCN-LABEL: flat_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %ptr, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr %ptr, i32 %offset
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
index 64c54ca..a41bf50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
index 4a735a7..1fdeef7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
declare float @llvm.amdgcn.fmad.ftz.f32(float %a, float %b, float %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
index 3860838..783a7c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_fmed3_f16:
; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
index 588b8c3..561f4e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_fmed3:
; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index 78768c8..c5daf21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX101 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOMADMACF32,GFX103 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX101 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,NOMADMACF32,GFX103 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
; GCN-LABEL: {{^}}test_mul_legacy_f32:
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
index 7354ed5..4dcf1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.fract.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
index 361a42a..f1733d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.fract.f32(float) #0
declare double @llvm.amdgcn.fract.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
index 97eb86f..185e5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
index 43f2a5a..7356b7a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
index 7085932..62111c2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.frexp.mant.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
index a27034a..4e623dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare double @llvm.fabs.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index 4a66b76..b05f141 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefixes=GCN,PREGFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 | FileCheck %s -check-prefixes=GCN,PREGFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GCN,GFX12PLUS
declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index 968c198..8476bea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
index d8618cb..6275dfd9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900-GISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefix=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX900-GISEL
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 537aab9..b4acd5c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index 4db256d..0c5922e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
new file mode 100644
index 0000000..047a6cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col)
+
+define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: global_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sys_lu(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
index f8a7177..4c422bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CHECK,HSA %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CHECK,HSA %s
@lds0 = addrspace(3) global [512 x float] poison, align 4
@lds1 = addrspace(3) global [256 x float] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 260b6fb3..e2b068e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,SDAG-GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,SDAG-GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" < %s | FileCheck -check-prefixes=GCN,GFX11,SDAG-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" < %s | FileCheck -check-prefixes=GCN,GFX10,SDAG-GFX10 %s
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GCN,GFX11,GISEL-GFX11 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" < %s 2>%t | FileCheck -check-prefixes=GCN,GFX11,GISEL-GFX11 %s
; RUN: FileCheck --check-prefix=ERR %s < %t
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GCN,GFX10,GISEL-GFX10 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" < %s 2>%t | FileCheck -check-prefixes=GCN,GFX10,GISEL-GFX10 %s
; RUN: FileCheck --check-prefix=ERR %s < %t
; Note: GlobalISel abort is disabled so we don't crash on i1 inputs.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 13a53f0..366b71b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,SDAG-GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SDAG-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,SDAG-GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" < %s | FileCheck -check-prefixes=GCN,GFX11,SDAG-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,VI,SDAG-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,SDAG-GFX9 %s
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GCN,GFX11,GISEL-GFX11 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" < %s 2>%t | FileCheck -check-prefixes=GCN,GFX11,GISEL-GFX11 %s
; RUN: FileCheck --check-prefix=ERR %s < %t
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GCN,VI,GISEL-VI %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=fiji < %s 2>%t | FileCheck -check-prefixes=GCN,VI,GISEL-VI %s
; RUN: FileCheck --check-prefix=ERR %s < %t
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GCN,GFX9,GISEL-GFX9 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx900 < %s 2>%t | FileCheck -check-prefixes=GCN,GFX9,GISEL-GFX9 %s
; RUN: FileCheck --check-prefix=ERR %s < %t
; Note: GlobalISel abort is disabled so we don't crash on i1 inputs.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 565ad29..fc0f4eb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_iglp_opt() #0 {
; GCN-LABEL: test_iglp_opt:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
index 8e37d2f..713f82e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
; GFX9-LABEL: load_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
index a661730..eacdd91 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -early-live-intervals < %s | FileCheck -check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GCN,GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}atomic_swap_1d:
; GFX6789: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index 7be0d9c..3d1d6c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn -global-isel=1 -global-isel-abort=2 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn -global-isel=1 -global-isel-abort=2 -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12-GISEL %s
define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
index dbd324b..dc9b8f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX89 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=GCN,PACKED,GFX89 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}image_load_f16:
; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index 4a2c1fe..ed7d88b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -early-live-intervals < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN %s
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
; GCN-LABEL: load_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index beed453..4d9f094 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=VERDE %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=FIJI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null < %s | FileCheck -check-prefixes=NOPRT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
; VERDE-LABEL: load_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
index 93f0080..3b4db4a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX9-LABEL: gather4_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
index 3a5a608..c0cc079 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GCN,UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
index b5faae1..f6abd13 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,PREGFX12,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}gather4_2d:
; GFX6789: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll
index e7a57d5..a3bce37 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX11-ERR %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 < %s 2>&1 | FileCheck -check-prefixes=GFX11-ERR %s
; GFX11-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.gather4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
index fe65d6e..360b8cb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}getlod_1d:
; PRE-GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
index 9a5d4855..96f084e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}load.f16.1d:
; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
index 3e5a524..77bfe6b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}load.f32.1d:
; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index f188d37..3d64ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
; GFX11-LABEL: load_2dmsaa:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll
index b5b5944..c17efc2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}load_2dmsaa:
; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
index 14b9a40..78b35e9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -1,12 +1,12 @@
-; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
-; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
-; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
-; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
-; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
; Default NSA threshold is 3 addresses
; GCN-LABEL: {{^}}sample_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 4a58091..437f438 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
; GFX9-LABEL: sample_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll
index 6027d73..895c45a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
; GFX9-LABEL: sample_cd_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll
index 28a0611..5fe9100 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=VERDE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
; VERDE-LABEL: sample_cd_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll
index 0e8770f..4303af99 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_cd_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 323d0fb..5a35c69 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=TONGA %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -check-prefixes=GFX81 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; TONGA-LABEL: image_sample_2d_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
index 46191c7..a6c77ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A,SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -check-prefixes=GFX90A,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A,GISEL %s
; GFX90A-LABEL: {{^}}sample_1d:
; GFX90A-NOT: s_wqm_b64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index a713b1d..8b60aa0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=VERDE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
; VERDE-LABEL: sample_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
index 42fa415..f0ce166 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel < %s | FileCheck -check-prefixes=GFX10GISEL %s
; TODO: global-isel produces more code - there will need to be some more combines in the postregbankselectcombine phase
; Depends on some other changes to pass this test - those are in review separately
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index b6a8a1c..45cebaf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
index 67e6bb7..3685bcf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}sample_o_1d:
; GCN: image_sample_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
index fe76d9c..382c9c9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
; GFX9-LABEL: store_f16_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
index 1110892..51e17f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
; GFX9-LABEL: store_f32_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
index 8598b78..31c578b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck -check-prefix=GCN %s
; FIXME: Requires stack object to not assert
; GCN-LABEL: {{^}}test_ps:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index f7f72ae..4d93afb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -1,7 +1,7 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN,HSA,COV5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN,HSA,COV5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN,HSA,COV4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=GCN,MESA %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN,HSA,COV5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN,HSA,COV5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN,HSA,COV4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti | FileCheck -check-prefixes=GCN,MESA %s
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index b61ca56..fb52371 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -1,9 +1,9 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
index d4ae040..626d0c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; GCN-LABEL: {{^}}test_init_exec:
; GFX1032: s_mov_b32 exec_lo, 0x12345
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 2964f07..1ab4cb0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GISEL12 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=DAGISEL12 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=GISEL10 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=DAGISEL10 %s
define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
; GISEL12-LABEL: basic:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
index 362b18f..613d557 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GISEL12 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=DAGISEL12 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GISEL10 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=DAGISEL10 %s
; This shouldn't be too different from wave32, so we'll only test one case.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
index 5d2e107..96b5566 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-32BANK %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-32BANK %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-16BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-32BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8-32BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx810 < %s | FileCheck -check-prefixes=GFX8-16BANK %s
define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GFX9-32BANK-LABEL: interp_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 4d937da..46e2e92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GFX11-LABEL: v_interp_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index 704960c..64c55bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=kabini -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,16BANK %s
-; RUN: llc -mtriple=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,16BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=kabini < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,16BANK %s
+; RUN: llc -mtriple=amdgcn -mcpu=stoney < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,16BANK %s
; GCN-LABEL: {{^}}v_interp:
; GCN-NOT: s_wqm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 2c1b682..5b6fc6ae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; TODO: Run these for global isel as well.
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11,GFX11-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
index 8e244b5..835c924 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
; RUN: not llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=1 < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=0 < %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index 24e213e..114c81f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL_W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG_W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL_W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG_W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=1 < %s | FileCheck -check-prefix=GISEL_W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=0 < %s | FileCheck -check-prefix=SDAG_W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=1 < %s | FileCheck -check-prefix=GISEL_W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=0 < %s | FileCheck -check-prefix=SDAG_W32 %s
declare i1 @llvm.amdgcn.inverse.ballot.i64(i64)
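; inverse.ballot is the dual of ballot: every active lane reads bit `laneid`
; of the uniform mask operand, e.g. (name illustrative):
;   %bit = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %mask)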
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 167c2c4..58adbd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s
-; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-unknown < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s
; ALL-LABEL: {{^}}test:
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 94aad39..462090c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}gs_const:
; GCN-NOT: v_cmpx
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
index 6d1ca3f..948b7b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}lds_direct_load:
; GCN: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 0fe371c..3dc6c55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.lds.kernel.id()
declare i32 @llvm.amdgcn.workgroup.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
index 924d9eb..3d069db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}lds_param_load:
; GCN: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
index 9a2715b..43c69ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
new file mode 100644
index 0000000..017d402
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1), i32)
+declare <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1), i32)
+declare <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1), i32)
+declare i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr, i32)
+declare <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr, i32)
+declare <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr, i32)
+
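+; The trailing i32 operand of each intrinsic is the cache-policy immediate;
+; the checks below show the mapping to modifiers, e.g. 0 -> none,
+; 1 -> th:TH_LOAD_NT, 10 -> th:TH_LOAD_HT scope:SCOPE_SE,
+; 22 -> th:TH_LOAD_NT_HT scope:SCOPE_DEV, 27 -> th:TH_LOAD_BYPASS scope:SCOPE_SYS.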
+define amdgpu_ps void @global_load_monitor_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b32_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 1)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b32_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 10)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b64_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 22)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b64_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 27)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b128_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b128 v[4:7], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) %gep, i32 0)
+ store <4 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b128_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_monitor_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) %gep, i32 1)
+ store <4 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @flat_load_monitor_b32(ptr %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: flat_load_monitor_b32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_load_monitor_b32 v0, v[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr addrspace(0) %gep, i32 10)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @flat_load_monitor_b64(ptr %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: flat_load_monitor_b64:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_load_monitor_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr addrspace(0) %gep, i32 22)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @flat_load_monitor_b128(ptr %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: flat_load_monitor_b128:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_load_monitor_b128 v[4:7], v[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %val = call <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr addrspace(0) %gep, i32 27)
+ store <4 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
+; GFX1250-LABEL: global_load_monitor_b32_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
+ %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 1)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
+; GFX1250-LABEL: global_load_monitor_b64_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i64 %idxprom
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
+; GFX1250-SDAG-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
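+; Note on the last three tests: scale_offset appears to be folded only when
+; the GEP element size matches the access width (i32 GEP for b32, i64 GEP
+; for b64); the mismatched i32-GEP/b64-load case keeps an explicit
+; shift-and-add address computation.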
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 8ab46fa..5d03dfb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX942-GISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefix=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=GFX942-GISEL
;; Note: load.to.lds is a wrapper intrinsic around underlying operations.
;; This is a bare-bones test to ensure that it lowers to the correct instructions.
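;; As a sketch, assuming the current global-to-LDS form of the intrinsic
;; (its declaration is not shown in this hunk), a call looks roughly like:
;;   call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %src,
;;             ptr addrspace(3) %dst, i32 4, i32 0, i32 0)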
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
index b0a2d10..dcf76a7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERR %s
; ERR: intrinsic not supported on subtarget
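; (The first run passes because v_log_clamp_f32 exists on the original SI
; subtarget; it was dropped on later targets, so tonga is expected to fail
; instruction selection with the message above.)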
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
index dfde1032..847957d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck %s
define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 6b6fb30..1585a2c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
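; In the mfma intrinsics the three trailing i32 operands are the cbsz, abid
; and blgp immediate modifiers (input broadcast/select controls); 0, 0, 0 is
; the unmodified form.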
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index a9cffd6..4c26961 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index ec4e1cb..b792a12 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 866dba77..9bdae28f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index d5ccc28..ccee113 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 561eaca..ff305da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index c98929c..7193fee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index 96975bd..8fbf131 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
; FIXME: The register allocator / scheduler should be able to avoid these hazards.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 481e721..e7d8683 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -1,9 +1,9 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s
; GFX10PLUS-LABEL: {{^}}dpp8_test:
; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 3a5519a..1d555f87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
index 9e6a161..a271bcd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
index 63d71a1..1b64e08 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
index 9944352..5a73374 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_mul_i24:
; GCN: v_mul_i32_i24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
index 6768475..38a80c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_mul_u24:
; GCN: v_mul_u32_u24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll
index 5a37673..b57a81f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
define i32 @basic(i32 %a, i32 %b) {
; CHECK-LABEL: basic:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll
index db325a2..8fad2e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
define i32 @basic(i32 %a, i32 %b) {
; CHECK-LABEL: basic:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
index f5f51f6..d639ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.perm(i32, i32, i32) #0
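; perm lowers to v_perm_b32: each result byte is chosen from the 8-byte pool
; formed by the two i32 sources, indexed by the matching byte of the third
; operand (selector bytes outside 0-7 produce constant values).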
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 8506e75..4c6095e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)
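; Operand order for both (as declared above): old value, source, two i32
; lane-select words each packing eight 4-bit lane numbers, then the i1
; fetch-invalid and bound_ctrl flags.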
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
index 10c0000..3d13593 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG %s
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-SDAG %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_p0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 33f0d60..356b767 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
-; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 6698d36..6dd2258 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s
declare i32 @llvm.amdgcn.permlane64(i32)
declare i32 @llvm.amdgcn.workitem.id.x()
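; Unlike permlane16, permlane64 takes no select operands: v_permlane64_b32
; swaps data between the two 32-lane halves of a wave64, exchanging lane i
; with lane i^32.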
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
index 393d8c1..b0149f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
; GFX11-SDAG-LABEL: test_p0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
index 465414c..6a5dc8f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
index 1410939..2a2a401 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
; CHECK-LABEL: {{^}}test1:
; CHECK: s_mov_b64 s[0:1], exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index f81be1a..bd904be 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index afc5807..de7d234 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
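; s_quadmask semantics: result bit i is the OR of input bits 4i..4i+3, one
; output bit per quad, e.g.:
;   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %mask)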
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index 7e16358..afb80e6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index 30b7b3b..7a20b5c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32:
@@ -251,24 +251,26 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
; CHECK-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-GISEL-TRUE16: ; %bb.0: ; %bb
-; CHECK-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
-; CHECK-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-TRUE16-NEXT: s_endpgm
+; CHECK-GISEL-LABEL: raw_atomic_buffer_load_v4i16:
+; CHECK-GISEL: ; %bb.0: ; %bb
+; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT: .LBB7_1: ; %bb1
+; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
+; CHECK-GISEL-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
index 4919080..cf746ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll
index 8e064ab..ccb79d1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; GCN-LABEL: {{^}}buffer_load_format_d16_x:
; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
index 5e84ea5..939e91b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
index ffd055e9..bf57e28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 5fe0cfb..e6a59f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX12,GFX12-SDAG
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX12,GFX12-GISEL
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s --check-prefixes=PREGFX10
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefixes=PREGFX10
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefixes=GFX10
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefixes=GFX11
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefixes=GFX12,GFX12-SDAG
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefixes=GFX12,GFX12-GISEL
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
; PREGFX10-LABEL: buffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
index 6e24717..8a6594f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mcpu=tahiti -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX6
-; RUN: llc -mcpu=hawaii -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX7
-; RUN: llc -mcpu=fiji -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX8
-; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX910,GFX9
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX910,GFX10
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=tahiti -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX67,GFX6
+; RUN: llc -mcpu=hawaii -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX67,GFX7
+; RUN: llc -mcpu=fiji -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX8
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX9
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
+; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: raw_buffer_load_i8_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
index cf1425c..79fba61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) {
; GCN-LABEL: buffer_store_format_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll
index 2fe162c..03e0044 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=VERDE %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
;CHECK-LABEL: {{^}}buffer_store:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 3493de1..89511de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=GFX68,VERDE %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 643805d..561ec7d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
@@ -251,24 +251,26 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
; CHECK-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-GISEL-TRUE16: ; %bb.0: ; %bb
-; CHECK-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
-; CHECK-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-TRUE16-NEXT: s_endpgm
+; CHECK-GISEL-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; CHECK-GISEL: ; %bb.0: ; %bb
+; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT: .LBB7_1: ; %bb1
+; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
+; CHECK-GISEL-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
index 843ad56..0eb85e22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll
index cafd903..638852b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefixes=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefixes=PACKED %s
define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) {
; UNPACKED-LABEL: buffer_load_format_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.ll
index 8021391..e37b877 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
index 51a8b97..f0204bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 3dc3ad1f..b5d741b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s --check-prefixes=PREGFX10
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefixes=PREGFX10
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefixes=GFX10
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefixes=GFX11
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll
index 6c23a87..1d2e325 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.d16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) {
; GCN-LABEL: buffer_store_format_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.ll
index d2c9b4b..d7faaec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.format.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=VERDE %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
;CHECK-LABEL: {{^}}buffer_store:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
index de1f859..91c479e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=VERDE %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
; VERDE-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
index 381924e..a9ea440 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
define amdgpu_ps half @tbuffer_load_d16_x(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.ll
index cce9af9..b311525 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefix=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefix=GFX11 %s
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: tbuffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
index d8e2ce3..9a51b12 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
index 9440efe..f778304c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
; PREGFX10-LABEL: tbuffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
index bbac914..4cbf66b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
index c59f8bc..f01e85a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefix=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefix=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
; PREGFX10-LABEL: tbuffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index aad3532..6248da0c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-TRUE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-FAKE16 %s
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-TRUE16 %s
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-FAKE16 %s
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-TRUE16 %s
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-FAKE16 %s
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
index 118fed1..8afa43a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefix=GFX12 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefix=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefix=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; PREGFX10-LABEL: tbuffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index a2be749..9983c09 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
declare half @llvm.amdgcn.rcp.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
index d8975ba..392a99f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: error: <unknown>:0:0: in function rcp_legacy_f32 void (ptr addrspace(1), float): intrinsic not supported on subtarget
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 61900c0..425a853 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.amdgcn.rcp.f32(float) #0
declare double @llvm.amdgcn.rcp.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 49a334b..d1ba892 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll
index 2fba984..9037129 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX11 %s
; Test codegen with readfirstlane used by M0.
;
; M0 can only be written to by SALU instructions so we can't emit
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
index 3882a5f..395abf0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_p0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 42aab18..7ff5eb4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
index 49f8ef3..ce34595 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_p0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index acb5ba8..e879fb2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index bf37147..9f26745 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
declare half @llvm.amdgcn.rsq.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
index 2a07501..2e56c42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.amdgcn.rsq.legacy(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index 7fea027..f99fe71 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.amdgcn.rsq.f32(float) #0
declare double @llvm.amdgcn.rsq.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 96da9b9..90e150c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT0 %s
-; RUN: llc -mtriple=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT3 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT4 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT5 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT6 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=VARIANT0 %s
+; RUN: llc -mtriple=amdgcn -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=VARIANT2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT3 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=VARIANT4 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT5 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=VARIANT6 %s
define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT0-LABEL: test_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index e106d0e..5428b5e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX678,GFX67,GFX6
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX678,GFX789,GFX67,GFX78,GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX678,GFX789,GFX8910,GFX78,GFX89,GFX8
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX789,GFX8910,GFX89,GFX910,GFX9
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX78910,GFX8910,GFX910,GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX678,GFX67,GFX6
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX678,GFX789,GFX67,GFX78,GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX678,GFX789,GFX8910,GFX78,GFX89,GFX8
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX789,GFX8910,GFX89,GFX910,GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX678910,GFX78910,GFX8910,GFX910,GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 | FileCheck %s -check-prefixes=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 | FileCheck %s -check-prefixes=GFX12
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
; GFX67-LABEL: s_buffer_load_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
index 69ed9d5..9efe49d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.amdgcn.s.decperflevel(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
index ffab3449..344f5e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
declare i32 @llvm.amdgcn.s.get.waveid.in.workgroup() #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
index 77bea2f..c7a12a7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
declare i64 @llvm.amdgcn.s.getpc() #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
index 52bdfbd..d64b1d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}s_getreg_test:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
index 3e35593..a5a080e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.amdgcn.s.incperflevel(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
index 82468b6..819e507 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 < %s 2>&1 | FileCheck -check-prefix=ERR %s
; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.s.memrealtime
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
index 1d7edb2..c8d03b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SIVI,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=SIVI,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=SIVI,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=SIVI,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
declare i64 @llvm.amdgcn.s.memtime() #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll
index 2eb9833..24fdb5d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_s_nop() {
; GCN-LABEL: test_s_nop:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll
index 374c646..34258d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_s_sethalt() {
; GCN-LABEL: test_s_sethalt:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
index 087f798..8282ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -global-isel -mtriple=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -mtriple=amdgcn -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 %s
declare void @llvm.amdgcn.s.setprio(i16) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
index 05186ac..81e9df1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde -show-mc-encoding < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s
; FIXME: This copy of the test is a subset of the -global-isel version, since the VGPR case doesn't work.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
index e3a577e..d8f7edd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.amdgcn.s.sleep(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
index f2ee110..11c2df9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=0 < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN %s
declare void @llvm.amdgcn.s.sleep.var(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
index 8aa8fac..6a05d6e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
declare void @llvm.amdgcn.s.ttracedata(i32)
declare void @llvm.amdgcn.s.ttracedata.imm(i16)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
index 433fefa..27a8b35 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
@@ -1,7 +1,7 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}test_wait_event:
; GFX11: s_wait_event 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll
index ff8f28d..0d7bab1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @test_bvhcnt() {
; GFX12-LABEL: test_bvhcnt:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
index efaf472..d8ed6a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}test1:
; CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
index 87c9213..44c88cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
index 2c013cc..d463d2c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
index 84b663a..284f0b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index 25b3617..e441d9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
; GCN: v_bfe_i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
index 95e3446..9b88a10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_sched_barrier() #0 {
; GCN-LABEL: test_sched_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
index 6507976..ae8ace2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index 371b4f0..04fcdc6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-minreg < %s | FileCheck -check-prefix=GCN-MINREG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck -check-prefix=GCN-MAXOCC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-ilp < %s | FileCheck -check-prefix=GCN-ILP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -misched=gcn-iterative-minreg < %s | FileCheck -check-prefix=GCN-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck -check-prefix=GCN-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -misched=gcn-iterative-ilp < %s | FileCheck -check-prefix=GCN-ILP %s
define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 73586b1..c8552d8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
define amdgpu_kernel void @test_sched_group_barrier() #0 {
; GCN-LABEL: test_sched_group_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
index ac54729..5a3e8d17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX908
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck %s --check-prefixes=GCN,GFX908
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
index fb44d11..3bfda26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
index 1c45a784..dc0c933 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX908
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck %s --check-prefixes=GCN,GFX908
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
index fdd457c..7370a3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
@@ -1,6 +1,6 @@
-;RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SIVI %s
-;RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VIPLUS,SIVI %s
-;RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VIPLUS,GFX9 %s
+;RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck --check-prefixes=GCN,SIVI %s
+;RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=GCN,VIPLUS,SIVI %s
+;RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,VIPLUS,GFX9 %s
; GCN-LABEL: {{^}}test_interrupt:
; GCN: s_mov_b32 m0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index 9a001e0..e4a87e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_doorbell:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index fbf8c203..18098d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GISEL11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,DAGISEL11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GISEL10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,DAGISEL10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11_W64,GISEL11_W64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11_W64,DAGISEL11_W64 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10_W64,GISEL10_W64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10_W64,DAGISEL10_W64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GISEL11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,DAGISEL11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GFX10,GISEL10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GFX10,DAGISEL10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX11_W64,GISEL11_W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX11_W64,DAGISEL11_W64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX10_W64,GISEL10_W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX10_W64,DAGISEL10_W64 %s
define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %inactive, i32 %active) {
; GFX11-LABEL: set_inactive_chain_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6cb2d6d..32cbe6d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -early-live-intervals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -early-live-intervals < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index 937b8bf..cfcac50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
index 0fe0640..75ea893 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.sin.f16(half %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
index 2b61cca..68c6670 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.amdgcn.sin.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
index 09abebd..b01977f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
; Check that WQM is not triggered by the softwqm intrinsic alone.
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index 2d8e9f2..f6f614e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_i32:
@@ -307,27 +307,29 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
; CHECK-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-GISEL-TRUE16: ; %bb.0: ; %bb
-; CHECK-GISEL-TRUE16-NEXT: s_clause 0x1
-; CHECK-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
-; CHECK-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-TRUE16-NEXT: s_endpgm
+; CHECK-GISEL-LABEL: struct_atomic_buffer_load_v4i16:
+; CHECK-GISEL: ; %bb.0: ; %bb
+; CHECK-GISEL-NEXT: s_clause 0x1
+; CHECK-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
+; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-GISEL-NEXT: .LBB8_1: ; %bb1
+; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
+; CHECK-GISEL-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
index 88c67c6..0c0fd14 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll
index 9bf64ba..3dd22ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; GCN-LABEL: {{^}}buffer_load_format_d16_x:
; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index e81fee9..568fb12 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck --check-prefixes=GFX6 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck --check-prefixes=GFX8PLUS %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null | FileCheck --check-prefixes=NOPRT %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
; GFX6-LABEL: buffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index 74d5274..43323e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mcpu=gfx1200 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mcpu=gfx1200 -mattr=+real-true16 -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
@esgs_ring = external addrspace(3) global [0 x i32], align 65536
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
index 5b75294..01d0a66c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 9290b51..57aa103 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefixes=CHECK,VI
;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index 60c04749..13b28d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mcpu=tahiti -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX6
-; RUN: llc -mcpu=hawaii -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX7
-; RUN: llc -mcpu=fiji -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX8
-; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX910,GFX9
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX910,GFX10
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=tahiti -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX67,GFX6
+; RUN: llc -mcpu=hawaii -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX67,GFX7
+; RUN: llc -mcpu=fiji -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX8
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX9
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
+; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: struct_buffer_load_i8_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
index 70e12ea..ff421d1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
; GCN-LABEL: buffer_store_format_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll
index 192b01a..21329de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefix=VERDE %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s
;CHECK-LABEL: {{^}}buffer_store:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 4319bdd..9ce33c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=GFX68,VERDE %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index ff5b17f..8f33dd6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32:
@@ -307,27 +307,29 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
; CHECK-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-GISEL-TRUE16: ; %bb.0: ; %bb
-; CHECK-GISEL-TRUE16-NEXT: s_clause 0x1
-; CHECK-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
-; CHECK-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-TRUE16-NEXT: s_endpgm
+; CHECK-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; CHECK-GISEL: ; %bb.0: ; %bb
+; CHECK-GISEL-NEXT: s_clause 0x1
+; CHECK-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
+; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-GISEL-NEXT: .LBB8_1: ; %bb1
+; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
+; CHECK-GISEL-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.ll
index 607f600..b534088 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None | FileCheck %s
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll
index 39df6ec..ca722147 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefixes=PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefixes=PACKED %s
define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) {
; UNPACKED-LABEL: buffer_load_format_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
index 5b73d58..63bacf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck --check-prefixes=GFX6 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck --check-prefixes=GFX8PLUS %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 | FileCheck --check-prefixes=NOPRT %s
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
; GFX6-LABEL: buffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
index ff02c2e..0fbb302 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
@esgs_ring = external addrspace(3) global [0 x i32], align 65536
define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll
index 35c959f..4cfe686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
index bfbc765..3c5dae0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s --check-prefixes=CHECK,SI
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefixes=CHECK,VI
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
; CHECK-LABEL: buffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll
index 51d3687..8fea08d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
; GCN-LABEL: buffer_store_format_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll
index 61a08d9..3ded36a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=CHECK,SI %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=CHECK,SI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=CHECK,VI %s
define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
; CHECK-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll
index d08623f..df94352 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=CHECK,SI %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=CHECK,SI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=CHECK,VI %s
define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
; CHECK-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.d16.ll
index 2f26743..91c36cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.d16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
define amdgpu_ps half @tbuffer_load_d16_x(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.ll
index b144e37..e5185f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: tbuffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
index fc8f8af..eb28f63 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
index 753d17a..1955fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=VERDE %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
; VERDE-LABEL: tbuffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
index f93e188..dc08377 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 | FileCheck -enable-var-scope -check-prefixes=GFX12-PACKED %s
define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
index 04539ff..b555c37 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
; PREGFX10-LABEL: tbuffer_load:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 268ac53..4f97075 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-PACKED %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-PACKED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-SDAG,GFX12-PACKED-SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12-PACKED,GFX12-PACKED-GISEL,GFX12-PACKED-GISEL-FAKE16 %s
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index ab0f189..3a0b2c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=VERDE %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=PREGFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; VERDE-LABEL: tbuffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll
index 5013428..ea2bbf8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll
index 4355cc8..5be3308 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
index c89c5c5..f0b02dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GCNX3
+; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx600 | FileCheck %s -check-prefixes=GCN,SI
+; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx700 | FileCheck %s -check-prefixes=GCN,GCNX3
; GCN-LABEL: {{^}}tbuffer_raw_load_immoffs_x3:
; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
index d5cbadd..732967b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
@@ -1,4 +1,4 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN
+;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx700 | FileCheck %s -check-prefixes=GCN
; GCN-LABEL: {{^}}tbuffer_raw_store_immoffs_x3:
; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index 66708f6..bb32987 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index defaf70..d4aa2051 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=VI %s
define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_arg_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
index 33ef082..d0b432d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942-SDAG
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942-GISEL
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942-SDAG
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942-GISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
index c3de1db..2b28396 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX10
declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
index c976962..5f586fd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX10
declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 4441565..8b78c4e68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-OPT-LABEL: dpp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
index ca6bccd..f0031dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150,GFX1150-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150,GFX1150-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) {
; GFX11-LABEL: gather_sample:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
index 4b4bdfe..2e12340 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_wave_barrier:
; GCN-DAG: ; wave barrier
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index b95cf86..f668a116 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-SDAG-ERR %s
-; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=0 < %s 2>&1 | FileCheck -check-prefix=GFX9-SDAG-ERR %s
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefix=GFX12 %s
; GFX9-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.wave.id
; GFX9-GISEL-ERR: LLVM ERROR: unable to legalize instruction: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.wave.id)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 33dd2bd..2e880d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,W64 %s
; GCN-LABEL: {{^}}fold_wavefrontsize:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 2f5ff90..9149ed5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -304,6 +304,556 @@ bb:
ret void
}
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
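+
+; NOTE: in the f8f6f4 intrinsic the i32 immediate before each of the A and B
+; operands selects that matrix's format (0 = FP8, 1 = BF8, 2 = FP6, 3 = BF6,
+; 4 = FP4), and the packed operand width scales with the element size:
+; <16 x i32> for the 8-bit, <12 x i32> for the 6-bit, and <8 x i32> for the
+; 4-bit formats.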
+
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
@@ -815,6 +1365,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
@@ -824,6 +1375,7 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index fe8358f..12ea314 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -1342,6 +1342,110 @@ bb:
ret void
}
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
+; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
+; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s1, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
+; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
+; GFX1250-NEXT: v_mov_b32_e32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
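+
+; NOTE: a splat of an inline constant (such as 1.0 above) folds directly into
+; the accumulator operand of v_wmma_f32_16x16x128_f8f6f4; a non-splat or
+; non-inlineable accumulator (such as 3.0 = 0x40400000) is materialized into
+; VGPRs first.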
+
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
; GFX1250: ; %bb.0: ; %bb
@@ -2227,6 +2331,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
index 9802144a..bf8308b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
@@ -1126,6 +1126,72 @@ bb:
ret void
}
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
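+
+; NOTE: the i16 modifier immediate drives the C-operand source modifiers:
+; bit 0 emits neg_lo:[0,0,1] (negate C), bit 1 adds neg_hi:[0,0,1] (absolute
+; value of C), and the value 4 marks C as ignored, so no modifier is emitted
+; at all.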
+
define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC:
; GFX1250: ; %bb.0: ; %bb
@@ -1967,6 +2033,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>,
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index 3874a45..0a1df42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 25adc25..4f19d61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 13ce979..7d3b316 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-64 %s
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
index 91fc606..c9f4aca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
declare i32 @llvm.amdgcn.s.wqm.i32(i32)
declare i64 @llvm.amdgcn.s.wqm.i64(i64)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
index 34c6149..f437cd2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
@@ -1,9 +1,9 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
;CHECK-LABEL: {{^}}ret:
;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index e6cc8f9..a10c861 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX802-GISEL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX802-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
index 40e1243..796884a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
; GFX802-SDAG-LABEL: test_writelane_p0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 32d8aa1..893dc39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
declare half @llvm.ceil.f16(half %a)
declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
new file mode 100644
index 0000000..1015b75
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s
+; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s
+
+; FIXME: GlobalISel does not work with bf16
+
+declare bfloat @llvm.cos.bf16(bfloat) #0
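+
+; NOTE: the hardware cosine consumes an input pre-scaled by 1/(2*pi), so the
+; folded constants below are bf16 0x3f23 (4.0/(2*pi) ~= 0.6366) and 0x417f
+; (100.0/(2*pi) ~= 15.92).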
+
+define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 {
+; GCN-LABEL: cos_bf16_constant_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_cos_bf16_e32 v0, 0x3f23
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_store_b16 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %cos = call bfloat @llvm.cos.bf16(bfloat 4.0) #0
+ store bfloat %cos, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 {
+; GCN-LABEL: cos_bf16_constant_100:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_cos_bf16_e32 v0, 0x417f
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_store_b16 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %cos = call bfloat @llvm.cos.bf16(bfloat 100.0) #0
+ store bfloat %cos, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 8c5bc4a..7d63e22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: cos_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index fa50123..4d23fb1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOOPT %s
-; RUN: llc -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s
+; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=GCN,NOOPT %s
+; RUN: llc -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=GCN,OPT %s
; GCN-LABEL: {{^}}test_debug_value:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 978f223..8c1e166 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -5213,121 +5213,15 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
}
define float @v_exp_f32_undef() {
-; VI-SDAG-LABEL: v_exp_f32_undef:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000
-; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4
-; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_exp_f32_undef:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
-; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_exp_f32_undef:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
-; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1
-; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_exp_f32_undef:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0
-; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-SDAG-LABEL: v_exp_f32_undef:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000
-; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
-; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1
-; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp_f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_exp_f32_undef:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0
-; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0
-; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp_f32_undef:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_f32_undef:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 70c3787..edc505b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -5291,121 +5291,15 @@ define float @v_exp10_f32_dynamic_mode(float %in) #1 {
}
define float @v_exp10_f32_undef() {
-; VI-SDAG-LABEL: v_exp10_f32_undef:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000
-; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4
-; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_exp10_f32_undef:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
-; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_exp10_f32_undef:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
-; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1
-; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_exp10_f32_undef:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0
-; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-SDAG-LABEL: v_exp10_f32_undef:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000
-; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
-; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1
-; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp10_f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_exp10_f32_undef:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0
-; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37
-; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0
-; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp10_f32_undef:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_f32_undef:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 15bcab9..e71ea50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -2783,56 +2783,10 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
}
define float @v_exp2_f32_undef() {
-; GCN-SDAG-LABEL: v_exp2_f32_undef:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000
-; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp2_f32_undef:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; SI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
-; SI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_exp2_f32_undef:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; VI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_exp2_f32_undef:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX900-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp2_f32_undef:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp2_f32_undef:
; R600: ; %bb.0:
@@ -4076,3 +4030,4 @@ attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN-GISEL: {{.*}}
+; GCN-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index f6a9fad..22bb79d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
declare half @llvm.floor.f16(half %a)
declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 544941b..97ea988 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s
declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 61991c8..efb55db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=VI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=VI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX10-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX10-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-FAKE16 %s
declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index d411601..4f5432a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx704 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7CHECK,GFX7SELDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx704 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7CHECK,GFX7GLISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8CHECK,GFX8SELDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8CHECK,GFX8GLISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9CHECK %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9CHECK %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10CHECK %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10CHECK %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11CHECK %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11CHECK %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx704 < %s | FileCheck --check-prefixes=GFX7CHECK,GFX7SELDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx704 < %s | FileCheck --check-prefixes=GFX7CHECK,GFX7GLISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8CHECK,GFX8SELDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8CHECK,GFX8GLISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX9CHECK %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX9CHECK %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11CHECK %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11CHECK %s
define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
; GFX7SELDAG-LABEL: sgpr_isnan_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 5634df5..38d1b47 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -5590,162 +5590,15 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
}
define float @v_log_f32_undef() {
-; SI-SDAG-LABEL: v_log_f32_undef:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_log_f32_e32 v0, s4
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log_f32_undef:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0
-; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2
-; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1
-; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-SDAG-LABEL: v_log_f32_undef:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_log_f32_e32 v0, s4
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_log_f32_undef:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_log_f32_undef:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
-; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
-; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1
-; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log_f32_undef:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1
-; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: v_log_f32_undef:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_log_f32_undef:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1100-GISEL-LABEL: v_log_f32_undef:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_log_f32_undef:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_log_f32_undef:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 8d1a231..058933f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -5590,162 +5590,15 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
}
define float @v_log10_f32_undef() {
-; SI-SDAG-LABEL: v_log10_f32_undef:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_log_f32_e32 v0, s4
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log10_f32_undef:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0
-; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2
-; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1
-; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-SDAG-LABEL: v_log10_f32_undef:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_log_f32_e32 v0, s4
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_log10_f32_undef:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_log10_f32_undef:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
-; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
-; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1
-; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log10_f32_undef:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1
-; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: v_log10_f32_undef:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_log10_f32_undef:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1100-GISEL-LABEL: v_log10_f32_undef:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_log10_f32_undef:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_log10_f32_undef:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 7ca72bf..4ca612a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -3542,45 +3542,15 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
}
define float @v_log2_f32_undef() {
-; GFX689-SDAG-LABEL: v_log2_f32_undef:
-; GFX689-SDAG: ; %bb.0:
-; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4
-; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX689-GISEL-LABEL: v_log2_f32_undef:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: v_log2_f32_undef:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_log2_f32_undef:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1100-GISEL-LABEL: v_log2_f32_undef:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_log2_f32_undef:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_log2_f32_undef:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 863240c..de24617 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
index 41e8762..63e9eef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3) nocapture, i32, i1) nounwind
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture, i64, i1) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 7e8c301..22f0957 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 66cf8a3..6ae058b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index 72260e0..6e24a6a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -1,36 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250,GL2-ONLY %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX1250-SPREFETCH,GFX1250-SPREFETCH-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-cu-prefetch < %s | FileCheck --check-prefixes=GCN,GFX1250,SAFE-CU %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX12-SPREFETCH,SPREFETCH-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250,GL2-ONLY %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX1250-SPREFETCH,GFX1250-SPREFETCH-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-cu-prefetch < %s | FileCheck --check-prefixes=GCN,GFX1250,SAFE-CU %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX12-SPREFETCH,SPREFETCH-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
; Scalar data prefetch
define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
ret void
}
define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:512 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x200, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -40,14 +58,20 @@ entry:
; Check large offsets
define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_max_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:8388607 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -55,6 +79,20 @@ entry:
}
define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_min_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:-8388608 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_data_sgpr_min_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -68,6 +106,13 @@ define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr
; SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
@@ -81,6 +126,18 @@ entry:
}
define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_data_sgpr_too_large_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -91,6 +148,13 @@ define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inre
; SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
@@ -105,15 +169,113 @@ entry:
; Check divergent address
-define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) {
-; GCN-LABEL: prefetch_data_vgpr:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_endpgm
+define amdgpu_ps void @prefetch_data_vgpr_global(ptr addrspace(1) %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
ret void
}
+define amdgpu_ps void @prefetch_data_vgpr_flat(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p0(ptr %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_global(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+; GFX12-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_endpgm
+; GFX11-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_flat(ptr inreg %ptr, i32 %offset) {
+; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+; GFX12-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_endpgm
+; GFX11-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.prefetch.p0(ptr %gep2, i32 0, i32 0, i32 1)
+ ret void
+}
+
; Check LDS and Scratch, which we cannot prefetch
define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) {
@@ -137,43 +299,59 @@ entry:
; Check supported address spaces
define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_flat:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
 tail call void @llvm.prefetch.p0(ptr %ptr, i32 0, i32 0, i32 1)
ret void
}
define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_global:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_global:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
ret void
}
define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_mov_b32 s1, 0
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
ret void
@@ -182,28 +360,36 @@ entry:
; I$ prefetch
define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_inst_sgpr:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_inst_sgpr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_inst_sgpr:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
ret void
}
define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_inst_sgpr_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_inst_sgpr_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -213,14 +399,18 @@ entry:
; Check large offsets
define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -228,6 +418,18 @@ entry:
}
define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_inst_sgpr_min_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -241,6 +443,13 @@ define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr
; SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
@@ -254,6 +463,16 @@ entry:
}
define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_inst_sgpr_too_large_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -264,6 +483,13 @@ define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inre
; SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
@@ -276,6 +502,282 @@ entry:
ret void
}
+; Check cache locality
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_dev(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat_dev:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p0(ptr %ptr, i32 0, i32 1, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_se(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat_se:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p0(ptr %ptr, i32 0, i32 2, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_cu(ptr %ptr) {
+; GL2-ONLY-LABEL: prefetch_data_vgpr_flat_cu:
+; GL2-ONLY: ; %bb.0: ; %entry
+; GL2-ONLY-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GL2-ONLY-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; SAFE-CU-LABEL: prefetch_data_vgpr_flat_cu:
+; SAFE-CU: ; %bb.0: ; %entry
+; SAFE-CU-NEXT: flat_prefetch_b8 v[0:1]
+; SAFE-CU-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p0(ptr %ptr, i32 0, i32 3, i32 1)
+ ret void
+}
+
+; flat offset
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_offset(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr %ptr, i32 128
+ tail call void @llvm.prefetch.pf(ptr %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_global_offset(ptr addrspace(1) %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_global_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_global_saddr(ptr addrspace(1) inreg %ptr, i32 %voffset) {
+; GFX1250-LABEL: prefetch_data_vgpr_global_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %voffset
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_global_saddr_offset(ptr addrspace(1) inreg %ptr, i32 %voffset) {
+; GFX1250-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %voffset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep2, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Cannot prefetch I$ with flat or global instructions.
+
+define amdgpu_ps void @prefetch_inst_vgpr_global(ptr addrspace(1) %ptr) {
+; GCN-LABEL: prefetch_inst_vgpr_global:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_vgpr_flat(ptr %ptr) {
+; GCN-LABEL: prefetch_inst_vgpr_flat:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; Force vector prefetch for uniform address with rw = 1 argument.
+
+define amdgpu_ps void @prefetch_data_sgpr_flat_force_vector(ptr inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_flat_force_vector:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global_force_vector(ptr addrspace(1) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_global_force_vector:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global_saddr_force_vector(ptr addrspace(1) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x400, null, 0
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 1024
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 1, i32 0, i32 1)
+ ret void
+}
+
declare void @llvm.prefetch.pf(ptr nocapture readonly, i32, i32, i32)
declare void @llvm.prefetch.p1(ptr addrspace(1) nocapture readonly, i32, i32, i32)
declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i32)
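
For reference, the cache-locality tests above all funnel through the same four-argument intrinsic declared here; below is a minimal sketch of the mapping they exercise. The function name @prefetch_example is illustrative only, and the locality-to-scope mapping is read off the GFX1250 checks above (0 -> SCOPE_SYS, 1 -> SCOPE_DEV, 2 -> SCOPE_SE, 3 -> SCOPE_SE or unscoped depending on the safe-cu configuration):

; Hypothetical example, not part of the diff. Arguments are (address,
; rw [0 = read, 1 = write], locality [0-3], cache type [1 = data, 0 = inst]).
; Per the force_vector tests above, rw = 1 selects the vector prefetch path
; even for a uniform (inreg) address.
define amdgpu_ps void @prefetch_example(ptr %p) {
entry:
  tail call void @llvm.prefetch.pf(ptr %p, i32 0, i32 1, i32 1) ; data, SCOPE_DEV
  ret void
}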
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
index 3607e23..de488c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
index 07010c8..e1ce776 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
index ba261e2..8f50d94 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index de12f2b..3d8a8a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX12,GFX12-FAKE16 %s
declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
index c6cf6f6..5bed2f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}rint_f64:
; CI: v_rndne_f64_e32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll
index 58a7771..e760e8f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; FUNC-LABEL: {{^}}rint_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 355f77a..af914bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
new file mode 100644
index 0000000..701f54b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s
+; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s
+
+; FIXME: GlobalISel does not work with bf16
+
+declare bfloat @llvm.sin.bf16(bfloat) #0
+
+define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 {
+; GCN-LABEL: sin_bf16_constant_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_sin_bf16_e32 v0, 0x3f23
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_store_b16 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %sin = call bfloat @llvm.sin.bf16(bfloat 4.0) #0
+ store bfloat %sin, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 {
+; GCN-LABEL: sin_bf16_constant_100:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_sin_bf16_e32 v0, 0x417f
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_store_b16 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %sin = call bfloat @llvm.sin.bf16(bfloat 100.0) #0
+ store bfloat %sin, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
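
A hypothetical annotation of the folded immediates above (assuming v_sin_bf16 takes its input pre-divided by 2*pi, as the f32 variant does):

; 4.0   / (2*pi) = 0.63662 -> bf16 0x3f23 (0.63672, nearest representable)
; 100.0 / (2*pi) = 15.915  -> bf16 0x417f (15.9375, nearest representable)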
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 1a42609..ba03115 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: sin_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
index 576ed27..2366e39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -1,8 +1,8 @@
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; FUNC-LABEL: sin_f32
; EG: MULADD_IEEE *
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 8604feb..3e56fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
declare half @llvm.sqrt.f16(half %a)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 0f709b0..482a7de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
declare half @llvm.trunc.f16(half %a)
declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index 3df2627..2623d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; Tests whether a load chain of 8 constants gets vectorized into a wider load.
define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 919c1df..001d748 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6-NOHSA %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6-NOHSA %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GFX7-HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-NOHSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; FUNC-LABEL: {{^}}constant_load_f64:
define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 67c2ee6..bfc01ef 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
; GFX6-LABEL: constant_load_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 58a4122..4491c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-NOHSA-SI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-HSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN-NOHSA-SI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefix=GCN-HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index d86402a..0a938b0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6-NOHSA %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6-NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GFX7-HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-NOHSA %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-HSA %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-HSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX9-HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-NOHSA-LABEL: constant_load_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 2219cee..542b0cc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-LABEL: constant_load_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b1bdfa6..b39b38a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6-NOHSA %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6-NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GFX7-HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-NOHSA %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; TODO: NOT AND
define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 2c9766c..825ae80 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %in) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 0918ea48..5e5c3bc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-promote-alloca < %s | FileCheck --check-prefix=GFX803 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
define <2 x i16> @load_local_lo_v2i16_undeflo(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_lo_v2i16_undeflo:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
index 61b1167..b03d395 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; Testing for ds_read/write_128
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
index 96b1107..60c321b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; Testing for ds_read_b128
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
; FUNC-LABEL: {{^}}local_load_f64:
; SICIV: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
index 43d102e..9821bca 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}local_load_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 8b71025..8dcecfe 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s
; Testing for ds_read/write_b128
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-DS128 %s
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index c445d2b..58e35e0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global,-enable-ds128 < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global,-enable-ds128 < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-flat-for-global,-enable-ds128 < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; Testing for ds_read/write_128
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
index fe33f29..a912752 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; Testing for ds_read/write_b128
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
; FUNC-LABEL: {{^}}local_load_i64:
; SICIVI: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index 9731491..6851b98 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
; Testing for ds_read/write_b128
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index 8a3cc57e..c9615f4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; Test that checks for redundant copies to temporary stack slot produced by
; expandUnalignedLoad.
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index d634e40..5b6af76 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index b917b48..509aba4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v3i32:
diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll
index 209f951..a26d5d4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
define <2 x i32> @range_metata_sext_range_0_i24_i64_bitcast(ptr addrspace(1) %ptr) {
; GCN-LABEL: range_metata_sext_range_0_i24_i64_bitcast:
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 9e51858..d9ad959 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
; Combine on select c, (load x), (load y) -> load (select c, x, y)
; drops MachinePointerInfo, so it can't be relied on for correctness.
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
new file mode 100644
index 0000000..76e2092
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
@@ -0,0 +1,104 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: merge_global_load_dword_2_no_scale_offset
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_global_load_dword_2_no_scale_offset
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s64) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[COPY]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+ S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: no_merge_global_load_dword_2_same_scale_offset
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_global_load_dword_2_same_scale_offset
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
+ ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2049, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2049, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+ S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: no_merge_global_load_dword_2_different_scale_offset
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_global_load_dword_2_different_scale_offset
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2048, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
+ ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2048, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+ S_NOP 0, implicit %1, implicit %2
+...
+
+# NB: We do not currently support merging the SGPR offset and SGPR+Imm offset
+# forms of S_LOAD, but the check stays the same: these cannot be merged when
+# their scale offsets differ.
+#
+# We also do not currently merge flat scratch instructions, although there is
+# a common check in the merge logic that CPol must not be set for a merge to
+# happen.
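+#
+# (Assumption for readers, not asserted by the checks themselves: in the CPol
+# immediates used below, the scale-offset flag appears to live in bit 11, so
+# 2048 and 2049 (= 2048 | 1) have it set, while 0 and 1 leave it clear.)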
+
+---
+name: merge_s_load_x1_x1_imm_no_scale_offset
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset
+ ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: no_merge_s_load_x1_x1_imm_same_scale_offset
+body: |
+ bb.0:
+ ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_same_scale_offset
+ ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 2048 :: (dereferenceable invariant load (s32))
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32))
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 2048 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: no_merge_s_load_x1_x1_imm_different_scale_offset
+body: |
+ bb.0:
+ ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_different_scale_offset
+ ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32))
+ %0:sgpr_64 = IMPLICIT_DEF
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
index 15ab2d7..59675a2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-NOHSA,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=FUNC,CI-HSA,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=SI-NOHSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI-NOHSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=FUNC,CI-HSA,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=SI-NOHSA,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll
index a71418f..74a785c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI,CIPLUS %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,CIPLUS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,CIPLUS %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI,CIPLUS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,CIPLUS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,CIPLUS %s
; GCN-LABEL: {{^}}local_i32_load
; SICIVI: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics.ll b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
index b5f81f0..7461122 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=redwood -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=redwood -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
; EG: LDS_WRXCHG_RET *
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
index e6ce939..3bf2a4e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SICIVI,GFX89 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SICIVI,GFX89 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
; SICIVI: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index bcc002f..2444b2d 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -check-prefixes=GCN,SI
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -check-prefixes=GCN,CI
+; RUN: llc -mtriple=amdgcn -mcpu=verde --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -check-prefixes=GCN,SI
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -check-prefixes=GCN,CI
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll
index 6ba84b2..0453cc8 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck --check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=FUNC %s
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll b/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
index 05befe9..f1bb2c1 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -mattr=-promote-alloca | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca | FileCheck %s -check-prefix=CHECK
; Allocate two stack slots of 2052 bytes each requiring a total of 4104 bytes.
; Extracting the last element of each does not fit into the offset field of
diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
index 9b501ae..4d751f2 100644
--- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
+++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; OBJ: Relocations [
; OBJ-NEXT: ]
diff --git a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
index b29092a..d712ea1 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -1,6 +1,6 @@
; RUN: opt -passes=loop-idiom -S < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: opt -passes=loop-idiom -S < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: opt -passes=loop-idiom -S < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -passes=loop-idiom -S < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -passes=loop-idiom -S < %s -mtriple=amdgcn -mcpu=tonga | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; Make sure loop-idiom doesn't create memcpy or memset. There are no library
; implementations of these for R600.
diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
index dec86d4..0ce3742 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -amdgpu-enable-rewrite-partial-reg-uses=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -amdgpu-enable-rewrite-partial-reg-uses=false < %s | FileCheck %s
; This example used to produce a verifier error resulting from the
; register coalescer leaving behind a false live interval when a live
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 874dece..1e6b77e 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_flat:
@@ -55,6 +56,33 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2
; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB0_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX1250-NEXT: .LBB0_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: flat_load_b128 v[2:5], v0, s[2:3] offset:-176
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b128 v0, v[2:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX1250-NEXT: s_cbranch_scc1 .LBB0_2
+; GFX1250-NEXT: .LBB0_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -123,6 +151,33 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2
; GFX12-SPREFETCH-NEXT: .LBB1_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB1_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX1250-NEXT: .LBB1_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176
+; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX1250-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX1250-NEXT: .LBB1_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -193,6 +248,34 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2
; GFX12-SPREFETCH-NEXT: .LBB2_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_constant:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB2_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: .LBB2_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] offset:176 scope:SCOPE_SE
+; GFX1250-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
+; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
+; GFX1250-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX1250-NEXT: s_cbranch_scc1 .LBB2_2
+; GFX1250-NEXT: .LBB2_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -262,6 +345,29 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12-SPREFETCH-NEXT: .LBB3_2: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_local:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s2, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB3_2
+; GFX1250-NEXT: .LBB3_1: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v4, s0
+; GFX1250-NEXT: s_add_co_i32 s2, s2, -1
+; GFX1250-NEXT: s_add_co_i32 s0, s0, 16
+; GFX1250-NEXT: s_add_co_i32 s1, s1, 16
+; GFX1250-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
+; GFX1250-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-NEXT: s_wait_dscnt 0x1
+; GFX1250-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
+; GFX1250-NEXT: s_wait_dscnt 0x1
+; GFX1250-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
+; GFX1250-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1250-NEXT: .LBB3_2: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -280,3 +386,267 @@ for.body: ; preds = %entry, %for.body
for.end: ; preds = %for.body, %entry
ret void
}
+
+define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
+; GFX12-LABEL: copy_flat_divergent:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: .LBB4_2: ; %for.body
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12-NEXT: .LBB4_3: ; %for.end
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_flat_divergent:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body
+; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe
+; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SPREFETCH-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12-SPREFETCH-NEXT: .LBB4_3: ; %for.end
+; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_flat_divergent:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
+; GFX1250-NEXT: .LBB4_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176
+; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3]
+; GFX1250-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1]
+; GFX1250-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1250-NEXT: .LBB4_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %s.tid = getelementptr inbounds <4 x i32>, ptr %s, i32 %tid
+ %d.tid = getelementptr inbounds <4 x i32>, ptr %d, i32 %tid
+ %cmp6.not = icmp eq i32 %n, 0
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = zext i32 %i.07 to i64
+ %arrayidx = getelementptr inbounds <4 x i32>, ptr %s.tid, i64 %idxprom
+ %ld = load <4 x i32>, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d.tid, i64 %idxprom
+ store <4 x i32> %ld, ptr %arrayidx2, align 4
+ %inc = add nuw i32 %i.07, 1
+ %exitcond.not = icmp eq i32 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
+; GFX12-LABEL: copy_global_divergent:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: .LBB5_2: ; %for.body
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX12-NEXT: .LBB5_3: ; %for.end
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_global_divergent:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: .LBB5_2: ; %for.body
+; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe
+; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0
+; GFX12-SPREFETCH-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX12-SPREFETCH-NEXT: .LBB5_3: ; %for.end
+; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_global_divergent:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
+; GFX1250-NEXT: .LBB5_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX1250-NEXT: global_prefetch_b8 v[2:3], off scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3]
+; GFX1250-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1]
+; GFX1250-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX1250-NEXT: .LBB5_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %s.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i32 %tid
+ %d.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i32 %tid
+ %cmp6.not = icmp eq i32 %n, 0
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = zext i32 %i.07 to i64
+ %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s.tid, i64 %idxprom
+ %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d.tid, i64 %idxprom
+ store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
+ %inc = add nuw i32 %i.07, 1
+ %exitcond.not = icmp eq i32 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
index 028758b..595a78ca 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 --symbolize-operands - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 --symbolize-operands - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
; GFX8-NOT: s_inst_prefetch
; GFX8-NOT: .palign 6
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index fcae73c..3af1341 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -disable-block-placement < %s | FileCheck -check-prefix=GCN %s
; Uses llvm.amdgcn.break
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index 2864e05..a33255a 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
; Where the mask of lanes wanting to exit the loop on this iteration is not
; obviously already masked by exec (in this case, the xor with -1 inserted by
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 10225bb..9dac239 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX9-SDAG-LABEL: buffer_nontemporal_load_store:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 047bdde..8281320 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -11,11 +11,13 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[BUF_PTR_VAR]], [[META10:![0-9]+]], !DIExpression(), [[DBG21]])
; CHECK-NEXT: [[AUX_PTR_VAR:%.*]] = alloca i160, align 32, addrspace(5), !dbg [[DBG22:![0-9]+]]
; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[AUX_PTR_VAR]], [[META12:![0-9]+]], !DIExpression(), [[DBG22]])
-; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META13:![0-9]+]], !DIExpression(), [[META23:![0-9]+]])
+; CHECK-NEXT: #dbg_value(i32 0, [[META13:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META23:![0-9]+]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF]], [[META13]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META23]])
; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG24:![0-9]+]]
; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]]
; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]]
-; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META15:![0-9]+]], !DIExpression(), [[META25:![0-9]+]])
+; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META25:![0-9]+]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX]], [[META15]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META25]])
; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG26:![0-9]+]]
; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]]
; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG26]]
@@ -24,10 +26,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG27]]
; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG27]]
; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]]
-; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META16:![0-9]+]], !DIExpression(), [[DBG27]])
+; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_2_PTR_OFF]], [[META16:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG27]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META16]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG27]])
; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG28:![0-9]+]]
; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG28]]
-; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META17:![0-9]+]], !DIExpression(), [[DBG28]])
+; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_3]], [[META17:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG28]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META17]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG28]])
; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]]
; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]]
@@ -38,7 +42,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG30]]
; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG30]]
; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]]
-; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META18:![0-9]+]], !DIExpression(), [[DBG30]])
+; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_4_PTR_OFF]], [[META18:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG30]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], [[META18]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG30]])
; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31:![0-9]+]]
; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG31]])
; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG32:![0-9]+]]
@@ -46,7 +51,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG32]]
; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG32]]
; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG32]]
-; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META20:![0-9]+]], !DIExpression(), [[DBG32]])
+; CHECK-NEXT: #dbg_value(i32 [[AUX_PTR_2_PTR_OFF]], [[META20:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG32]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], [[META20]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG32]])
; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]]
; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
index dba93a6..95e2ae9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -check-prefix=GCN %s
; Check that module LDS is allocated at address 0 and kernel starts its
; allocation past module LDS when a call is present.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 2a7553a..b6f70fa 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s
; Opt checks from utils/update_test_checks.py, llc checks from utils/update_llc_test_checks.py
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index dca9b71..c316f03 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=GCN %s
; Opt checks from utils/update_test_checks.py, llc checks from utils/update_llc_test_checks.py, both modified.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index a62427b..2554d99 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_kernel void @workgroup_ids_kernel() {
; GFX9-LABEL: workgroup_ids_kernel:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 52b1d5e..4812898 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_cs void @_amdgpu_cs_main() {
; GFX9-LABEL: _amdgpu_cs_main:
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index caff6c2..6e92677 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GCN %s
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v1v:
diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
index 82c6584..5d98a4b0 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=pitcairn < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) {
; GCN-LABEL: zext_shl64_to_32:
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 9a93b1d..68506ce 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_lshr_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index 67138ae..41eeeaf 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-STD %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-STD %s
; Make sure we don't form mad with denormals
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-DENORM,SI-DENORM-FASTFMAF %s
-; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-DENORM,SI-DENORM-SLOWFMAF %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-DENORM,SI-DENORM-FASTFMAF %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-DENORM,SI-DENORM-SLOWFMAF %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
new file mode 100644
index 0000000..11cda2d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
@@ -0,0 +1,634 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.hi = lshr i32 %src0, 16
+ %src1.hi = lshr i32 %src1, 16
+ %src2.hi = lshr i32 %src2, 16
+ %src0.i16 = trunc i32 %src0.hi to i16
+ %src1.i16 = trunc i32 %src1.hi to i16
+ %src2.i16 = trunc i32 %src2.hi to i16
+ %src0.fp16 = bitcast i16 %src0.i16 to bfloat
+ %src1.fp16 = bitcast i16 %src1.i16 to bfloat
+ %src2.fp16 = bitcast i16 %src2.i16 to bfloat
+ %src0.ext = fpext bfloat %src0.fp16 to float
+ %src1.ext = fpext bfloat %src1.fp16 to float
+ %src2.ext = fpext bfloat %src2.fp16 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.hi = extractelement <2 x bfloat> %src0, i32 1
+ %src1.hi = extractelement <2 x bfloat> %src1, i32 1
+ %src2.hi = extractelement <2 x bfloat> %src2, i32 1
+ %src0.ext = fpext bfloat %src0.hi to float
+ %src1.ext = fpext bfloat %src1.hi to float
+ %src2.ext = fpext bfloat %src2.hi to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
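+; Packed <2 x bfloat> inputs are currently unpacked and lowered to
+; v_pk_fma_f32 rather than a pair of mix instructions.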
+define <2 x float> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_shuffle:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v5, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.shuf = shufflevector <2 x bfloat> %src0, <2 x bfloat> undef, <2 x i32> <i32 1, i32 0>
+ %src1.shuf = shufflevector <2 x bfloat> %src1, <2 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+ %src2.shuf = shufflevector <2 x bfloat> %src2, <2 x bfloat> undef, <2 x i32> <i32 1, i32 1>
+ %src0.ext = fpext <2 x bfloat> %src0.shuf to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1.shuf to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2.shuf to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ ret <2 x float> %result
+}
+
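+; fneg and fabs applied after the extend should fold into the -v, |v|,
+; and -|v| source modifiers.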
+define float @v_mad_mix_f32_negbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %src0.ext.neg = fneg float %src0.ext
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_absbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_absbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+ %src0.ext.neg.abs = fneg float %src0.ext.abs
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg.abs, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_negf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negf32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.neg = fneg float %src2
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_absf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_absf32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.abs = call float @llvm.fabs.f32(float %src2)
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.abs)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_negabsf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negabsf32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.abs = call float @llvm.fabs.f32(float %src2)
+ %src2.neg.abs = fneg float %src2.abs
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg.abs)
+ ret float %result
+}
+
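+; Constant addends are materialized in an SGPR rather than encoded as an
+; inline immediate of the mix instruction.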
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32imm1(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imm1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 1.0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 0.15915494
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 0x3e230000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2 = fpext bfloat 0xR3e23 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 0x367c0000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2 = fpext bfloat 0xR367c to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_f32imm1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 1.0, float 1.0>)
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_cvtbf16imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_cvtbf16imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT: s_mov_b32 s0, 0x3e230000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], s[0:1] op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2 = fpext <2 x bfloat> <bfloat 0xR3e23, bfloat 0xR3e23> to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2)
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 0.15915494 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2 = fpext <2 x bfloat> <bfloat 0xR3e23, bfloat 0xR3e23> to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 0x3FC45F3060000000, float 0x3FC45F3060000000>)
+ ret <2 x float> %result
+}
+
+define float @v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.hi = extractelement <2 x bfloat> %src0, i32 1
+ %src1.hi = extractelement <2 x bfloat> %src1, i32 1
+ %src2.hi = extractelement <2 x bfloat> %src2, i32 1
+ %src0.ext = fpext bfloat %src0.hi to float
+ %src1.ext = fpext bfloat %src1.hi to float
+ %src2.ext = fpext bfloat %src2.hi to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ ret float %clamp
+}
+
+define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: no_mix_simple:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ ret float %result
+}
+
+define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: no_mix_simple_fabs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_f32 v0, |v0|, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.fabs = call float @llvm.fabs.f32(float %src0)
+ %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2)
+ ret float %result
+}
+
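+; With the ieee denormal mode from attributes #1, fmuladd still selects
+; the mix instruction, but a separate fmul + fadd pair must stay unfused.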
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, float %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_mul_f32 v0, v0, v1
+; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %mul = fmul float %src0.ext, %src1.ext
+ %result = fadd float %mul, %src2.ext
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %mul = fmul float %src0.ext, %src1.ext
+ %result = fadd float %mul, %src2
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %mul = fmul contract float %src0.ext, %src1.ext
+ %result = fadd contract float %mul, %src2.ext
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %mul = fmul contract float %src0.ext, %src1.ext
+ %result = fadd contract float %mul, %src2
+ ret float %result
+}
+
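+; Modifiers applied to the bfloat value before the extend are also
+; candidates for folding into source modifiers.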
+define float @v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0
+ %src0.neg = fneg bfloat %src0
+ %src0.ext = fpext bfloat %src0.neg to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1
+ %src0.neg = fneg bfloat %src0
+ %src0.ext = fpext bfloat %src0.neg to float
+ %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1
+ %src0.abs = call bfloat @llvm.fabs.bf16(bfloat %src0)
+ %src0.ext = fpext bfloat %src0.abs to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %fneg = fneg <2 x bfloat> %src0.arg.bc
+ %src0 = extractelement <2 x bfloat> %fneg, i32 1
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc)
+ %src0 = extractelement <2 x bfloat> %fabs, i32 1
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc)
+ %fneg.fabs = fneg <2 x bfloat> %fabs
+ %src0 = extractelement <2 x bfloat> %fneg.fabs, i32 1
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
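+; A bfloat bitcast from half is just a reinterpreted bit pattern; check
+; when the mix form can still be used.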
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0, half %src1, half %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_fmac_f32_e32 v0, v3, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.bf16 = bitcast half %src0 to bfloat
+ %src1.bf16 = bitcast half %src1 to bfloat
+ %src2.bf16 = bitcast half %src2 to bfloat
+ %src0.ext = fpext bfloat %src0.bf16 to float
+ %src1.ext = fpext bfloat %src1.bf16 to float
+ %src2.ext = fpext bfloat %src2.bf16 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.bf16 = bitcast half %src0 to bfloat
+ %src0.ext = fpext bfloat %src0.bf16 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_fma_mix_f32_bf16_src2_bf16lo:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1]
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
+entry:
+ %v0 = shl i32 %y, 16
+ %v1 = bitcast i32 %v0 to float
+ %mul7 = fmul contract float %x, 0.000000e+00
+ %add2 = fadd contract float %mul7, %v1
+ %v2 = fcmp uno float %add2, 0.000000e+00
+ %v3 = select i1 %v2, i64 1, i64 0
+ store i64 %v3, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+declare bfloat @llvm.fabs.bf16(bfloat) #2
+declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #2
+declare float @llvm.fabs.f32(float) #2
+declare float @llvm.minnum.f32(float, float) #2
+declare float @llvm.maxnum.f32(float, float) #2
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
new file mode 100644
index 0000000..5b2de59
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
+
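+; Check selection of v_fma_mixhi_bf16 when the truncated result is
+; inserted into the high element of a <2 x bfloat>.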
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0x3f80
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_mov_b32_e32 v0, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %vec.result = insertelement <2 x bfloat> <bfloat 1.0, bfloat undef>, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo(bfloat %src0, bfloat %src1, bfloat %src2, bfloat %lo) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %vec = insertelement <2 x bfloat> undef, bfloat %lo, i32 0
+ %vec.result = insertelement <2 x bfloat> %vec, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
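+; Packing the truncated result into the high bits of an i32 instead goes
+; through the mixlo form plus a shift.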
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %bc = bitcast bfloat %cvt.result to i16
+ %ext = zext i16 %bc to i32
+ %shr = shl i32 %ext, 16
+ ret i32 %shr
+}
+
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %bc = bitcast bfloat %cvt.result to i16
+ %ext = sext i16 %bc to i32
+ %shr = shl i32 %ext, 16
+ ret i32 %shr
+}
+
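+; A clamp before the conversion folds into the mix instruction's clamp
+; modifier; clamping the bf16 result afterwards does not.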
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %cvt.result = fptrunc float %clamp to bfloat
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ store volatile bfloat %cvt.result, ptr addrspace(1) undef
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index c0fb145..88c619e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=SDAG-CI %s
; FIXME-TRUE16. fix gisel
-; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GISEL-CI %s
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
new file mode 100644
index 0000000..557080a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -0,0 +1,540 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+
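+; Check selection of v_fma_mixlo_bf16 for f32 results truncated to bfloat.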
+define bfloat @mixlo_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: mixlo_simple:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
+; GFX1250-LABEL: mixlo_simpl_no_flush:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush(bfloat %src0, bfloat %src1, bfloat %src2) {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %cvt.result = fptrunc float %clamp to bfloat
+ ret bfloat %cvt.result
+}
+
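+; Vector cases either split into v_pk_fma_f32 plus v_cvt_pk_bf16_f32
+; packing or use mixlo/mixhi pairs, depending on the element count.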
+define <2 x bfloat> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ ret <2 x bfloat> %cvt.result
+}
+
+define <3 x bfloat> @v_mad_mix_v3f32(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_mov_b32_e32 v0, v6
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+ %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+ %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+ %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+ %cvt.result = fptrunc <3 x float> %result to <3 x bfloat>
+ ret <3 x bfloat> %cvt.result
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
+; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v4 :: v_dual_lshlrev_b32 v10, 16, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[12:13]
+; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[6:7], v[8:9], v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+ %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+ %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+ %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+ %cvt.result = fptrunc <4 x float> %result to <4 x bfloat>
+ ret <4 x bfloat> %cvt.result
+}
+
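+; Post-conversion clamps are expanded with bf16 min/max sequences; the
+; pre-conversion clamp at the end folds into clamp modifiers on the f32
+; operations.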
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
+; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %cvt.result, <2 x bfloat> zeroinitializer)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ ret <2 x bfloat> %clamp
+}
+
+define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0
+; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+ %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+ %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+ %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+ %cvt.result = fptrunc <3 x float> %result to <3 x bfloat>
+ %max = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %cvt.result, <3 x bfloat> zeroinitializer)
+ %clamp = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %max, <3 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+ ret <3 x bfloat> %clamp
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3
+; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
+; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+ %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+ %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+ %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+ %cvt.result = fptrunc <4 x float> %result to <4 x bfloat>
+ %max = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %cvt.result, <4 x bfloat> zeroinitializer)
+ %clamp = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %max, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+ ret <4 x bfloat> %clamp
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ %cvt.lo = extractelement <2 x bfloat> %cvt.result, i32 0
+ %max.lo = call bfloat @llvm.maxnum.bf16(bfloat %cvt.lo, bfloat 0.0)
+ %clamp.lo = call bfloat @llvm.minnum.bf16(bfloat %max.lo, bfloat 1.0)
+ %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.lo, i32 0
+ ret <2 x bfloat> %insert
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ %cvt.hi = extractelement <2 x bfloat> %cvt.result, i32 1
+ %max.hi = call bfloat @llvm.maxnum.bf16(bfloat %cvt.hi, bfloat 0.0)
+ %clamp.hi = call bfloat @llvm.minnum.bf16(bfloat %max.hi, bfloat 1.0)
+ %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.hi, i32 1
+ ret <2 x bfloat> %insert
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
+ %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+ %cvt.result = fptrunc <2 x float> %clamp to <2 x bfloat>
+ ret <2 x bfloat> %cvt.result
+}
+
+define <3 x bfloat> @v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+ %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+ %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+ %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+ %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
+ %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
+ %cvt.result = fptrunc <3 x float> %clamp to <3 x bfloat>
+ ret <3 x bfloat> %cvt.result
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32_clamp_precvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3
+; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v5 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_fma_f32 v[4:5], v[8:9], v[10:11], v[12:13]
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_num_f32_e64 v2, v5, v5 clamp
+; GFX1250-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX1250-NEXT: v_max_num_f32_e64 v3, v4, v4 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v3, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+ %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+ %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+ %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+ %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
+ %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ %cvt.result = fptrunc <4 x float> %clamp to <4 x bfloat>
+ ret <4 x bfloat> %cvt.result
+}
+
+define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: mixlo_zext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ %cvt.result.i16 = bitcast bfloat %cvt.result to i16
+ %cvt.result.i32 = zext i16 %cvt.result.i16 to i32
+ ret i32 %cvt.result.i32
+}
+
+define bfloat @mixlo_fptrunc(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %mul = fmul float %a, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_no_flush(float %a, float %b) {
+; GFX1250-LABEL: mixlo_fptrunc_no_flush:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %mul = fmul float %a, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc_abs_src_mod:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, |v0|, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %mul = fmul float %a.fabs, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc_neg_src_mod:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, -v0, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %a.fneg = fneg float %a
+ %mul = fmul float %a.fneg, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1
+
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1
+
+declare float @llvm.minnum.f32(float, float) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.maxnum.f32(float, float) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 32e0d39..811e255 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1,18 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=SDAG-CI %s
; FIXME-TRUE16: enable gisel
-; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GISEL-CI %s
define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
; GFX1100-LABEL: mixlo_simple:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index e2170fa..a487853 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -1,20 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100,SDAG-GFX1100-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100,SDAG-GFX1100-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic -verify-machineinstrs --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX9GEN,SDAG-GFX9GEN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100,SDAG-GFX1100-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100,SDAG-GFX1100-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX9GEN,SDAG-GFX9GEN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
; FIXME-TRUE16: enable gisel
-; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GEN,GISEL-GFX9GEN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX9GEN,GISEL-GFX9GEN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,GISEL-CI %s
define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 9ad5626..ef80323 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: GFX9 should be producing v_mad_u16 instead of v_mad_legacy_u16.
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 07b5e16..9d0e65b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
; If the workgroup id range is restricted, we should be able to use
; mad24 for the usual indexing pattern.
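As a hedged illustration (not part of this patch), the indexing pattern the comment refers to is a flat global id built from a bounded workgroup id and workitem id; when !range metadata keeps both operands within 24 bits, the multiply-add can select to v_mad_u24. The kernel below is a minimal sketch with assumed bounds (256 threads per group, at most 65536 groups); the name and constants are illustrative only:

define amdgpu_kernel void @global_id_sketch(ptr addrspace(1) %out) {
  %wg = call i32 @llvm.amdgcn.workgroup.id.x(), !range !0
  %wi = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
  %base = mul i32 %wg, 256          ; group offset, bounded by !0
  %gid = add i32 %base, %wi         ; result fits in 24 bits, eligible for mad24
  store i32 %gid, ptr addrspace(1) %out
  ret void
}
declare i32 @llvm.amdgcn.workgroup.id.x()
declare i32 @llvm.amdgcn.workitem.id.x()
!0 = !{i32 0, i32 65536}
!1 = !{i32 0, i32 256}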
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 66df769..cf9a700 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
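For context, a hedged sketch (not from the patch) of the IR shape that selects to v_mad_u64_u32: a 32x32->64-bit multiply plus a 64-bit addend. The function name and values are assumptions; the overlap rule the comment states (vdst and src2 fully overlapping or fully disjoint, never partially overlapping) is enforced during selection and register allocation, which is what this test guards.

define i64 @mad_u64_u32_sketch(i32 %a, i32 %b, i64 %c) {
  %a.ext = zext i32 %a to i64
  %b.ext = zext i32 %b to i64
  %mul = mul i64 %a.ext, %b.ext     ; 32x32 -> 64-bit product
  %mad = add i64 %mul, %c           ; folds with the multiply into v_mad_u64_u32
  ret i64 %mad
}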
diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
index eed4c2e..93fda94 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
index ac8d7d6..eb28e6f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GFX9-LABEL: mad_i32_vvv:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index 99d930b..a6d458e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index d5188a6..9bee6bd 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-MAD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-MAD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX942-FMA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX942-FMA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll
index 1769b74..4ef752b 100644
--- a/llvm/test/CodeGen/AMDGPU/madmk.ll
+++ b/llvm/test/CodeGen/AMDGPU/madmk.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; FIXME: None of these trigger madmk emission anymore. It is still
; possible, but requires the correct registers to be used which is
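As a hedged aside, the pattern v_madmk_f32 encodes is dst = src0 * K + src1, with K carried as a 32-bit literal operand. A minimal sketch of IR that can match it, with an assumed constant and illustrative name:

define float @madmk_sketch(float %a, float %b) {
  %mul = fmul float %a, 1.000000e+01   ; K = 10.0, the literal operand
  %mad = fadd float %mul, %b
  ret float %mad
}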
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
index 409b1d6..ce67a2e 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
@@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read
body: |
bb.0:
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
...
@@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop
body: |
bb.0:
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0
S_NOP 0
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
...
@@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg
body: |
bb.0:
liveins: $vgpr0
- INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr1
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr1
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/mai-inline.ll b/llvm/test/CodeGen/AMDGPU/mai-inline.ll
index ee57165..d0c0b9b 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/mai-inline.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; GCN-LABEL: {{^}}accvgpr_write_read:
; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 4896e50..65b4d37 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 %s -o - | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 %s -o - | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX9-LABEL: test:
diff --git a/llvm/test/CodeGen/AMDGPU/max-sgprs.ll b/llvm/test/CodeGen/AMDGPU/max-sgprs.ll
index 964b1ed..429e3cb 100644
--- a/llvm/test/CodeGen/AMDGPU/max-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-sgprs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}max_sgprs_gfx10:
; GCN: NumSgprs: 108
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 1e24646..a5b64f6 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck %s --check-prefix=VI
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck %s --check-prefix=GFX9
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index 3d8d849..b9b29b7 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_test_imax_sge_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/max3.ll b/llvm/test/CodeGen/AMDGPU/max3.ll
index a757bb0..b922854 100644
--- a/llvm/test/CodeGen/AMDGPU/max3.ll
+++ b/llvm/test/CodeGen/AMDGPU/max3.ll
@@ -1,6 +1,7 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_1250 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250,GFX9_1250 %s
; GCN-LABEL: {{^}}v_test_imax3_sgt_i32:
; GCN: v_max3_i32
@@ -46,7 +47,7 @@ define amdgpu_kernel void @v_test_umax3_ugt_i32(ptr addrspace(1) %out, ptr addrs
; VI: v_max_i16
; VI: v_max_i16
-; GFX9: v_max3_i16
+; GFX9_1250: v_max3_i16
define amdgpu_kernel void @v_test_imax3_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -70,7 +71,7 @@ define amdgpu_kernel void @v_test_imax3_sgt_i16(ptr addrspace(1) %out, ptr addrs
; VI: v_max_u16
; VI: v_max_u16
-; GFX9: v_max3_u16
+; GFX9_1250: v_max3_u16
define amdgpu_kernel void @v_test_umax3_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -94,7 +95,7 @@ define amdgpu_kernel void @v_test_umax3_ugt_i16(ptr addrspace(1) %out, ptr addrs
; VI: v_max_i16
; VI: v_max_i16
-; GFX9: v_max3_i16
+; GFX9_1250: v_max3_i16
define amdgpu_kernel void @v_test_imax3_sgt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
@@ -118,7 +119,7 @@ define amdgpu_kernel void @v_test_imax3_sgt_i8(ptr addrspace(1) %out, ptr addrsp
; VI: v_max_u16
; VI: v_max_u16
-; GFX9: v_max3_u16
+; GFX9_1250: v_max3_u16
define amdgpu_kernel void @v_test_umax3_ugt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
@@ -142,7 +143,7 @@ define amdgpu_kernel void @v_test_umax3_ugt_i8(ptr addrspace(1) %out, ptr addrsp
; VI: v_max_i16
; VI: v_max_i16
-; GFX9: v_max3_i16
+; GFX9_1250: v_max3_i16
define amdgpu_kernel void @v_test_imax3_sgt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
@@ -166,7 +167,7 @@ define amdgpu_kernel void @v_test_imax3_sgt_i7(ptr addrspace(1) %out, ptr addrsp
; VI: v_max_u16
; VI: v_max_u16
-; GFX9: v_max3_u16
+; GFX9_1250: v_max3_u16
define amdgpu_kernel void @v_test_umax3_ugt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
@@ -260,6 +261,50 @@ define amdgpu_kernel void @v_test_umax3_ugt_i64(ptr addrspace(1) %out, ptr addrs
ret void
}
+; GCN-LABEL: {{^}}v_test_imax3_sgt_v2i16:
+; SI-COUNT-2: v_max3_i32
+; VI-COUNT-2: v_max_i16
+; GFX9-COUNT-2: v_pk_max_i16
+; GFX1250: v_pk_max3_i16
+define amdgpu_kernel void @v_test_imax3_sgt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
+ %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
+ %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %cptr, i32 %tid
+ %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
+ %a = load <2 x i16>, ptr addrspace(1) %gep0
+ %b = load <2 x i16>, ptr addrspace(1) %gep1
+ %c = load <2 x i16>, ptr addrspace(1) %gep2
+ %icmp0 = icmp sgt <2 x i16> %a, %b
+ %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b
+ %icmp1 = icmp sgt <2 x i16> %i0, %c
+ %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c
+ store <2 x i16> %i1, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_imax3_ugt_v2i16:
+; SI-COUNT-2: v_max3_u32
+; VI-COUNT-2: v_max_u16
+; GFX9-COUNT-2: v_pk_max_u16
+; GFX1250: v_pk_max3_u16
+define amdgpu_kernel void @v_test_imax3_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
+ %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
+ %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %cptr, i32 %tid
+ %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
+ %a = load <2 x i16>, ptr addrspace(1) %gep0
+ %b = load <2 x i16>, ptr addrspace(1) %gep1
+ %c = load <2 x i16>, ptr addrspace(1) %gep2
+ %icmp0 = icmp ugt <2 x i16> %a, %b
+ %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b
+ %icmp1 = icmp ugt <2 x i16> %i0, %c
+ %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c
+ store <2 x i16> %i1, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll b/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
index 9d29b32..1b3a626 100644
--- a/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
+++ b/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN %s
; These tests are split out from umed3.ll and smed3.ll and use the
; -amdgpu-scalar-ir-passes=false flag, because InstSimplify would constant
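For orientation, a hedged sketch of the clamp shape these tests exercise, written here with the min/max intrinsics rather than the icmp/select chains the tests build: clamping x to an assumed constant range [12, 17] is exactly med3(x, 12, 17), which the backend can fold to v_med3_u32 unless earlier IR passes simplify the chain away first.

define i32 @umed3_sketch(i32 %x) {
  %hi = call i32 @llvm.umin.i32(i32 %x, i32 17)    ; upper bound
  %med = call i32 @llvm.umax.i32(i32 %hi, i32 12)  ; lower bound
  ret i32 %med
}
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)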
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
index 173c9cc..417a4c5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -run-pass=si-memory-legalizer %s -o - | FileCheck %s
--- |
@@ -39,12 +40,7 @@
...
---
-# CHECK-LABEL: name: atomic_max_i32_noret
-# CHECK-LABEL: bb.1.atomic:
-# CHECK: BUFFER_ATOMIC_SMAX_ADDR64
-# CHECK-NEXT: S_WAITCNT_soft 3952
-# CHECK-NEXT: BUFFER_WBINVL1_VOL
name: atomic_max_i32_noret
alignment: 1
@@ -71,6 +67,46 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
+ ; CHECK-LABEL: name: atomic_max_i32_noret
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr1_vgpr2 = V_LSHL_B64_e64 $vgpr0_vgpr1, 3, implicit $exec
+ ; CHECK-NEXT: $sgpr7 = S_MOV_B32 61440
+ ; CHECK-NEXT: $sgpr6 = S_MOV_B32 0
+ ; CHECK-NEXT: S_WAITCNT 127
+ ; CHECK-NEXT: $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 1, 0, implicit $exec :: (volatile load (s64) from %ir.tid.gep, addrspace 1)
+ ; CHECK-NEXT: S_WAITCNT_soft 3952
+ ; CHECK-NEXT: $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec
+ ; CHECK-NEXT: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.atomic:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x0000000000000003
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec
+ ; CHECK-NEXT: dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0
+ ; CHECK-NEXT: S_WAITCNT 127
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; CHECK-NEXT: S_WAITCNT 3952
+ ; CHECK-NEXT: S_WAITCNT_soft 3952
+ ; CHECK-NEXT: BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from %ir.gep, addrspace 1)
+ ; CHECK-NEXT: S_WAITCNT_soft 3952
+ ; CHECK-NEXT: BUFFER_WBINVL1_VOL implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.exit:
+ ; CHECK-NEXT: liveins: $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc
+ ; CHECK-NEXT: S_ENDPGM 0
bb.0 (%ir-block.0):
successors: %bb.1.atomic(0x40000000), %bb.2.exit(0x40000000)
liveins: $vgpr0, $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 1379eb6..80445f7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -146,7 +146,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -218,7 +218,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -290,7 +290,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -360,7 +360,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -427,7 +427,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -499,7 +499,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -571,7 +571,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -663,7 +663,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -745,7 +745,7 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -843,7 +843,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -941,7 +941,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1033,7 +1033,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1115,7 +1115,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1213,7 +1213,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1311,7 +1311,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1491,7 +1491,7 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence release, !mmra !{!"amdgpu-as", !"global"}
+ fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1595,7 +1595,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1699,7 +1699,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1793,7 +1793,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1879,7 +1879,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1983,7 +1983,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -2087,6 +2087,6 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
index 971015b..7a419a5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
@@ -77,7 +77,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -143,7 +143,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -209,7 +209,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -275,7 +275,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -332,7 +332,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -389,7 +389,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -446,7 +446,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -503,7 +503,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -571,7 +571,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -637,7 +637,7 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -703,7 +703,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -769,7 +769,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -826,7 +826,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -883,7 +883,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -940,7 +940,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -997,7 +997,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1065,7 +1065,7 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1131,7 +1131,7 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence release, !mmra !{!"amdgpu-as", !"local"}
+ fence release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1197,7 +1197,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1263,7 +1263,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1320,7 +1320,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1377,7 +1377,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1434,7 +1434,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1491,6 +1491,6 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
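; For reference, a minimal sketch of the rename exercised throughout this
; file, assuming unchanged MMRA semantics: only the tag prefix moves from
; "amdgpu-as" to "amdgpu-synchronize-as", while the "local" (LDS) suffix and
; the fences themselves are untouched. The kernel name here is hypothetical.
define amdgpu_kernel void @mmra_rename_sketch() {
entry:
  fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
  ret void
}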
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
index e325071..064e3e0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -1,17 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s
---
-# GCN-LABEL: name: multiple_mem_operands
-# GCN-LABEL: bb.3:
-# GCN: S_WAITCNT_soft 3952
-# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: S_WAITCNT_soft 3952
-# GCN-NEXT: BUFFER_WBINVL1_VOL
name: multiple_mem_operands
body: |
+ ; GCN-LABEL: name: multiple_mem_operands
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x30000000), %bb.1(0x50000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; GCN-NEXT: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, addrspace 4)
+ ; GCN-NEXT: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) poison`, addrspace 5)
+ ; GCN-NEXT: S_WAITCNT 127
+ ; GCN-NEXT: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
+ ; GCN-NEXT: S_WAITCNT 3855
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 2, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) poison`, addrspace 5)
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; GCN-NEXT: S_WAITCNT 3855
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec
+ ; GCN-NEXT: S_BRANCH %bb.3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; GCN-NEXT: S_WAITCNT 3855
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAITCNT 127
+ ; GCN-NEXT: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
+ ; GCN-NEXT: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
+ ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 0, implicit $exec :: (load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(1) poison`, addrspace 1), (load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(5) poison`, addrspace 5)
+ ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: BUFFER_WBINVL1_VOL implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; GCN-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x30000000), %bb.2(0x50000000)
liveins: $sgpr0_sgpr1, $sgpr3
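# For context, a hedged sketch of how the autogenerated GCN check lines above
# are produced; the build directory path is an assumption:
#   llvm/utils/update_mir_test_checks.py --llc-binary build/bin/llc \
#     llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir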
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 2bda61a..ad12d02 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SCRATCH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx902 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GCN-SCRATCH %s
define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
; GCN-LABEL: vector_clause:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
index 530ff67..4dbd3e2 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
@L = external local_unnamed_addr addrspace(3) global [9 x double], align 16
@Ldisp = external local_unnamed_addr addrspace(3) global [96 x double], align 16
diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
index bda2ceb..d9c64a3 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; This used to crash in LiveIntervalAnalysis via SILoadStoreOptimizer
; while fixing up the merge of two ds_write instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
index ae4fd66..6b150ad 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s
; CHECK-LABEL: {{^}}test1:
; CHECK: ds_write_b32
diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
index 2960768..2e9d1b4 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,CI %s
; This test mostly exercises DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index 0460f83..6066fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s
; SPI_TMPRING_SIZE.WAVESIZE = 5
; GFX10: .long 165608
diff --git a/llvm/test/CodeGen/AMDGPU/mesa_regression.ll b/llvm/test/CodeGen/AMDGPU/mesa_regression.ll
index 4b669ac..653edda 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa_regression.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa_regression.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O2 -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=false < %s | FileCheck %s
; CHECK-LABEL: %entry
; CHECK: flat_load_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
index 1c03285..077529c 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 368ab0b..6763957 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 244b68c..6110b31 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
; Check that we do not copy agprs to vgprs and back inside the loop.
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 21af2dd..e6d7b14 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=FAST90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck -enable-var-scope --check-prefixes=FAST90A %s
; The generated code is better with gfx90a
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx942.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx942.ll
index e313680..02e08ee 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx942.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
index 0d1ea35..1c7e2e9 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll
index 04f2e32..207aaaa 100644
--- a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll
+++ b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s 2>&1 | FileCheck -check-prefix=WARN %s
; 1024 flat work group size across 2560 possible threads -> occupancy should be 8 max.
; WARN: warning: <unknown>:0:0: failed to meet occupancy target given by 'amdgpu-waves-per-eu' in 'occupancy_8_target_9': desired occupancy was 9, final occupancy is 8
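; A worked sketch of the bound above, assuming gfx900 limits (wave64,
; 10 waves/SIMD, 4 SIMDs/CU): 2560 threads/CU is 40 waves/CU; a 1024-thread
; flat workgroup is 1024/64 = 16 waves, so floor(40/16) = 2 workgroups fit,
; i.e. 32 waves/CU = 8 waves/SIMD, hence the final occupancy of 8 < 9.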
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ffaf6..bf2ddc1 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck --check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_sle_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/min3.ll b/llvm/test/CodeGen/AMDGPU/min3.ll
index 0e25540..e30b929 100644
--- a/llvm/test/CodeGen/AMDGPU/min3.ll
+++ b/llvm/test/CodeGen/AMDGPU/min3.ll
@@ -1,6 +1,7 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_1250 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250,GFX9_1250 %s
; GCN-LABEL: {{^}}v_test_imin3_slt_i32:
; GCN: v_min3_i32
@@ -116,7 +117,7 @@ define amdgpu_kernel void @v_test_umin3_2_uses(ptr addrspace(1) %out, ptr addrsp
; VI: v_min_i16
; VI: v_min_i16
-; GFX9: v_min3_i16
+; GFX9_1250: v_min3_i16
define amdgpu_kernel void @v_test_imin3_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -140,7 +141,7 @@ define amdgpu_kernel void @v_test_imin3_slt_i16(ptr addrspace(1) %out, ptr addrs
; VI: v_min_u16
; VI: v_min_u16
-; GFX9: v_min3_u16
+; GFX9_1250: v_min3_u16
define amdgpu_kernel void @v_test_umin3_ult_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -164,7 +165,7 @@ define amdgpu_kernel void @v_test_umin3_ult_i16(ptr addrspace(1) %out, ptr addrs
; VI: v_min_i16
; VI: v_min_i16
-; GFX9: v_min3_i16
+; GFX9_1250: v_min3_i16
define amdgpu_kernel void @v_test_imin3_slt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
@@ -188,7 +189,7 @@ define amdgpu_kernel void @v_test_imin3_slt_i8(ptr addrspace(1) %out, ptr addrsp
; VI: v_min_u16
; VI: v_min_u16
-; GFX9: v_min3_u16
+; GFX9_1250: v_min3_u16
define amdgpu_kernel void @v_test_umin3_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
@@ -212,7 +213,7 @@ define amdgpu_kernel void @v_test_umin3_ult_i8(ptr addrspace(1) %out, ptr addrsp
; VI: v_min_i16
; VI: v_min_i16
-; GFX9: v_min3_i16
+; GFX9_1250: v_min3_i16
define amdgpu_kernel void @v_test_imin3_slt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
@@ -236,7 +237,7 @@ define amdgpu_kernel void @v_test_imin3_slt_i7(ptr addrspace(1) %out, ptr addrsp
; VI: v_min_u16
; VI: v_min_u16
-; GFX9: v_min3_u16
+; GFX9_1250: v_min3_u16
define amdgpu_kernel void @v_test_umin3_ult_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
@@ -330,6 +331,50 @@ define amdgpu_kernel void @v_test_umin3_ult_i64(ptr addrspace(1) %out, ptr addrs
ret void
}
+; GCN-LABEL: {{^}}v_test_imin3_slt_v2i16:
+; SI-COUNT-2: v_min3_i32
+; VI-COUNT-2: v_min_i16
+; GFX9-COUNT-2: v_pk_min_i16
+; GFX1250: v_pk_min3_i16
+define amdgpu_kernel void @v_test_imin3_slt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+ %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
+ %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
+ %a = load <2 x i16>, ptr addrspace(1) %gep0
+ %b = load <2 x i16>, ptr addrspace(1) %gep1
+ %c = load <2 x i16>, ptr addrspace(1) %gep2
+ %icmp0 = icmp slt <2 x i16> %a, %b
+ %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b
+ %icmp1 = icmp slt <2 x i16> %i0, %c
+ %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c
+ store <2 x i16> %i1, ptr addrspace(1) %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_imin3_ult_v2i16:
+; SI-COUNT-2: v_min3_u32
+; VI-COUNT-2: v_min_u16
+; GFX9-COUNT-2: v_pk_min_u16
+; GFX1250: v_pk_min3_u16
+define amdgpu_kernel void @v_test_imin3_ult_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+ %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+ %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
+ %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
+ %a = load <2 x i16>, ptr addrspace(1) %gep0
+ %b = load <2 x i16>, ptr addrspace(1) %gep1
+ %c = load <2 x i16>, ptr addrspace(1) %gep2
+ %icmp0 = icmp ult <2 x i16> %a, %b
+ %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b
+ %icmp1 = icmp ult <2 x i16> %i0, %c
+ %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c
+ store <2 x i16> %i1, ptr addrspace(1) %outgep
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index 3614831..4f33b63 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-FAKE16 %s
define amdgpu_ps float @test_minmax_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_minmax_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index bdd8935..3702f32 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-FAKE16 %s
define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_minmax_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 4f066fd..c42c7c3 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
@ptr_load = addrspace(3) global ptr addrspace(4) poison, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
index 0f67a40..71900a4 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 {
; GFX11-LABEL: mixed_vmem_types:
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll b/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll
index 964ea58..aba14c3 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: _amdgpu_hs_main:
diff --git a/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll b/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll
index 5977566..6b1d9eb 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN %s
;
; Check that PS is wave64
; GCN-LABEL: _amdgpu_ps_main:
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 15f93f1..05ff5c8 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s
; Test case looks at the allocated offset of @used_by_both. It's at zero when
; allocated by itself, but at 8 when allocated in combination with the double.
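; A sketch of the packing implied above, assuming the module strategy orders
; the 8-byte-aligned double first: the double occupies offsets 0-7, so
; @used_by_both lands at offset 8 rather than 0.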
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
index 4e89a16..a7b4ba8 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: add_reg_imm
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
index fab5d38..60f77bd 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; XUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; XUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN,VI %s
; FIXME: broken on VI because flat instructions need to be emitted
; instead of the addr64 equivalents of the _OFFSET variants.
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 7eb4463..fcc5584 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; XUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; XUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN,VI %s
; FIXME: broken on VI because flat instructions need to be emitted
; instead of the addr64 equivalents of the _OFFSET variants.
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 6dbfebfd..30ad3be 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
index 2870af1..f7fb4a6 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: exp_f16
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
index c93eb1d..3768634 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: exp_f16
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index 56848ea..d6b0958 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: exp_f32
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
index 91964ab..0f4715f 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=si-fix-sgpr-copies < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck --check-prefix=GFX12 %s
define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) {
; GFX11-LABEL: name: vimage_move_to_valu
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
index a487650..9377387 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN %s
; In moveToVALU(), when a move to the vector ALU is performed, all instrs in
-; the use chain will be visited. We do not want the same node to be
+; the use chain will be visited. We do not want the same node to be
; pushed to the visit worklist more than once.
-
+
; GCN-LABEL: {{^}}in_worklist_once:
; GCN: buffer_load_dword
; GCN: BB0_1:
diff --git a/llvm/test/CodeGen/AMDGPU/movreld-bug.ll b/llvm/test/CodeGen/AMDGPU/movreld-bug.ll
index e2deac2..5bb9f2b 100644
--- a/llvm/test/CodeGen/AMDGPU/movreld-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/movreld-bug.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
; GCN-LABEL: {{^}}main:
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index 8426224..e12fe97 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
; Uses the old forms of the buffer intrinsics that don't take pointer arguments.
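; For context, a hedged sketch of the two intrinsic families this pair of
; files distinguishes; treat the exact parameter lists as assumptions:
declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32)
declare float @llvm.amdgcn.raw.ptr.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32)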
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 1480743..3d3c59f 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr-non-ptr-intrinsics.ll
index b16bd04..3acd1b0 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr-non-ptr-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=CHECK
; Test that buffer_load_format with a VGPR resource descriptor is properly
; legalized.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
index 796852e..a548353 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=CHECK
; Test that buffer_load_format with a VGPR resource descriptor is properly
; legalized.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index dd9f5fa..2f59d75 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -show-mc-encoding < %s | FileCheck %s
;;;==========================================================================;;;
;;; MUBUF LOAD TESTS
diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index ba4c29e..f8cce6e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -1,8 +1,8 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-FAKE16 %s
; GCN-LABEL: {{^}}v_mul_i16:
; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b5e7589..8d3716e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -1,10 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
; mul24 and mad24 are affected
@@ -124,6 +125,25 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: test_mul_v2i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: test_mul_v2i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -286,6 +306,29 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_v4i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
+; GFX1250-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v7
+; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v6
+; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v5
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v4
+; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_v4i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -402,6 +445,19 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_trunc_i64_mul_to_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_i32 s2, s3, s2
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_trunc_i64_mul_to_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -555,6 +611,29 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_trunc_i64_mul_to_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s6, s10
+; GFX1250-NEXT: s_mov_b32 s7, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null
+; GFX1250-NEXT: buffer_load_b32 v1, off, s[4:7], null
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_trunc_i64_mul_to_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -670,6 +749,19 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul64_sext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul64_sext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -773,6 +865,18 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul64_zext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul64_zext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
@@ -909,6 +1013,26 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul64_sext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 0x50, v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_sext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1052,6 +1176,25 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul64_zext_c:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 0x50, v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_zext_c:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1192,6 +1335,26 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul64_sext_inline_imm:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 9, v[0:1]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul64_sext_inline_imm:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1300,6 +1463,20 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_i32 s2, s2, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
@@ -1425,6 +1602,24 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -1540,6 +1735,22 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b8 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i1:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
@@ -1699,6 +1910,28 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s10, s6
+; GFX1250-NEXT: s_mov_b32 s11, s7
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s8, s2
+; GFX1250-NEXT: s_mov_b32 s9, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: buffer_load_u8 v0, off, s[8:11], null
+; GFX1250-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
+; GFX1250-NEXT: s_mov_b32 s4, s0
+; GFX1250-NEXT: s_mov_b32 s5, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT: buffer_store_b8 v0, off, s[4:7], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i1:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
@@ -1856,6 +2089,19 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i64:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
@@ -2044,6 +2290,29 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i64:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s10, -1
+; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s14, s10
+; GFX1250-NEXT: s_mov_b32 s15, s11
+; GFX1250-NEXT: s_mov_b32 s6, s10
+; GFX1250-NEXT: s_mov_b32 s7, s11
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s12, s2
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null
+; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null
+; GFX1250-NEXT: s_mov_b32 s8, s0
+; GFX1250-NEXT: s_mov_b32 s9, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
@@ -2286,6 +2555,41 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul32_in_branch:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s6, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_cbranch_scc0 .LBB15_2
+; GFX1250-NEXT: ; %bb.1: ; %else
+; GFX1250-NEXT: s_mul_i32 s7, s0, s1
+; GFX1250-NEXT: s_branch .LBB15_3
+; GFX1250-NEXT: .LBB15_2:
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: ; implicit-def: $sgpr7
+; GFX1250-NEXT: .LBB15_3: ; %Flow
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
+; GFX1250-NEXT: s_cbranch_vccnz .LBB15_5
+; GFX1250-NEXT: ; %bb.4: ; %if
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, s2
+; GFX1250-NEXT: s_mov_b32 s5, s3
+; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX1250-NEXT: s_branch .LBB15_6
+; GFX1250-NEXT: .LBB15_5:
+; GFX1250-NEXT: v_mov_b32_e32 v0, s7
+; GFX1250-NEXT: .LBB15_6: ; %endif
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul32_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
@@ -2539,6 +2843,34 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: mul64_in_branch:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1250-NEXT: s_cbranch_scc0 .LBB16_3
+; GFX1250-NEXT: ; %bb.1: ; %else
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX1250-NEXT: s_cbranch_execnz .LBB16_4
+; GFX1250-NEXT: .LBB16_2: ; %if
+; GFX1250-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_mov_b32 s4, s2
+; GFX1250-NEXT: s_mov_b32 s5, s3
+; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX1250-NEXT: s_branch .LBB16_5
+; GFX1250-NEXT: .LBB16_3:
+; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX1250-NEXT: s_branch .LBB16_2
+; GFX1250-NEXT: .LBB16_4:
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: .LBB16_5: ; %endif
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: mul64_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
@@ -2882,6 +3214,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: s_mul_i128:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c
+; GFX1250-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffff)
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mov_b32 s7, s3
+; GFX1250-NEXT: s_mov_b32 s17, s3
+; GFX1250-NEXT: s_mov_b32 s19, s3
+; GFX1250-NEXT: s_mov_b32 s20, s3
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s2, s8
+; GFX1250-NEXT: s_and_b64 s[4:5], s[12:13], s[4:5]
+; GFX1250-NEXT: s_mov_b32 s6, s13
+; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13]
+; GFX1250-NEXT: s_mul_u64 s[12:13], s[4:5], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s16, s9
+; GFX1250-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15]
+; GFX1250-NEXT: s_mul_u64 s[14:15], s[6:7], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s2, s13
+; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17]
+; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[14:15], s[2:3]
+; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17]
+; GFX1250-NEXT: s_mov_b32 s2, s15
+; GFX1250-NEXT: s_mov_b32 s15, s3
+; GFX1250-NEXT: s_mov_b32 s13, s3
+; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[14:15]
+; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9]
+; GFX1250-NEXT: s_mov_b32 s18, s5
+; GFX1250-NEXT: s_mov_b32 s21, s4
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX1250-NEXT: s_or_b64 s[4:5], s[12:13], s[20:21]
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[6:7], s[2:3]
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1250-NEXT: s_mov_b32 s2, -1
+; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
@@ -3159,6 +3537,43 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
; GFX12-NEXT: s_endpgm
;
+; GFX1250-LABEL: v_mul_i128:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX1250-NEXT: v_and_b32_e32 v16, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[0:3], v16, s[2:3] scale_offset
+; GFX1250-NEXT: global_load_b128 v[4:7], v16, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, v0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT: v_mul_u64_e32 v[6:7], v[0:1], v[6:7]
+; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v4
+; GFX1250-NEXT: v_mul_u64_e32 v[8:9], v[8:9], v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v4, v[6:7]
+; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v5
+; GFX1250-NEXT: v_mov_b32_e32 v10, v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v5, v0, v[10:11]
+; GFX1250-NEXT: v_add3_u32 v7, v3, v7, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v10, v13 :: v_dual_mov_b32 v13, v11
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v4, v1, v[12:13]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v9, v12
+; GFX1250-NEXT: v_mov_b32_e32 v14, v13
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[14:15]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v1, v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[0:1], v[6:7]
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[2:3] scale_offset
+; GFX1250-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -3271,6 +3686,13 @@ define i32 @mul_pow2_plus_1(i32 %val) {
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-LABEL: mul_pow2_plus_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; EG-LABEL: mul_pow2_plus_1:
; EG: ; %bb.0:
; EG-NEXT: CF_END
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 803cae4..f4e5c27 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GFX9 %s
; Make sure that AMDGPUCodeGenPrepare introduces mul24 intrinsics
; after SLSR, as the intrinsics would interfere. It's unclear if these
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 4377e75..bf8994e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 864bc0b..1870d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 42c6589..d6cc833 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -1,6 +1,6 @@
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by the later passes.
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 83dd442..1fad8f3 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-- -lowerswitch -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; Ensure two if.break calls, for both the inner and outer loops
; FIXME: duplicate comparison
diff --git a/llvm/test/CodeGen/AMDGPU/nand.ll b/llvm/test/CodeGen/AMDGPU/nand.ll
index ad5bfcb..781ce34 100644
--- a/llvm/test/CodeGen/AMDGPU/nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/nand.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}scalar_nand_i32_one_use
; GCN: s_nand_b32
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index f30a04a5..65446a0 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 < %s | FileCheck %s
; FP is in CSR range, modified.
define hidden fastcc void @callee_has_fp() #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 1821872..ccaf0ac 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; Test calls when called by other callable functions rather than
; kernels.
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 9a2d969..5ce30cb 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GCN %s
; After structurizing, there are 3 levels of loops. The i1 phi
; conditions mutually depend on each other, so it isn't safe to delete
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index f43ca4f..306703b 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-LABEL: _amdgpu_cs_main:
diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index c6b1fe8..afb289b 100644
--- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 25b7b043..e6243f0 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 < %s | FileCheck %s
; Test that source locations (.loc directives) are not added to the code within the prologue.
diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
index 944951d..88cc06d 100644
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s
; Check that a barrier or fence in between loads is not considered a clobber
; for the purpose of converting vector loads into scalar loads.
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 2bdacce..cfe7315 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch | FileCheck -check-prefixes=FLATSCR,DEFAULTSIZE %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=FLATSCR,ASSUME1024 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+enable-flat-scratch | FileCheck -check-prefixes=FLATSCR,DEFAULTSIZE %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=FLATSCR,ASSUME1024 %s
; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.
diff --git a/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll b/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll
index dce1a7f..88543c3 100644
--- a/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; Ensure NOOP shaders compile at OptNone.
diff --git a/llvm/test/CodeGen/AMDGPU/nor.ll b/llvm/test/CodeGen/AMDGPU/nor.ll
index 530a6e0..886605c 100644
--- a/llvm/test/CodeGen/AMDGPU/nor.ll
+++ b/llvm/test/CodeGen/AMDGPU/nor.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}scalar_nor_i32_one_use
; GCN: s_nor_b32
diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll
index ff80af3..4546d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll
+++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-xnack -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-xnack -enable-misched=0 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}sample_contig_nsa:
; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}],
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 5a736aa..1552014 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,GCN %s
-;RUN: llc < %s -mtriple=r600-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s
+;RUN: llc < %s -mtriple=amdgcn-- | FileCheck -check-prefixes=CHECK,GCN %s
+;RUN: llc < %s -mtriple=r600-- | FileCheck -check-prefixes=CHECK,R600 %s
%struct.S = type { ptr addrspace(5), ptr addrspace(1), ptr addrspace(4), ptr addrspace(3), ptr, ptr addrspace(2)}
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 61ac1fe..d95fc77 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; Test splitting flat instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index de5f4f9..20916a9 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
; Test splitting flat instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index c1ae681..9371ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12,GFX12-FAKE16 %s
; IEEE bit is enabled for compute kernels, so omod shouldn't be used.
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
diff --git a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index 9dcb9b1..000d313 100644
--- a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -mtriple=amdgcn | FileCheck --check-prefix=SI %s
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
; Make sure the OpenCL Image lowering pass doesn't crash when argument metadata
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
index 778d73f..1427225 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -early-live-intervals < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -early-live-intervals < %s | FileCheck %s
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
index 5425ff7..98d48e5 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
; Make sure there isn't an extra space between the instruction name and the first operand.
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index e798646..51db31d 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) {
; GCN-LABEL: if_masked_1:
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 720eaef..0887f41 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-LABEL: negated_cond:
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 1abd2e6..7ef87a4e 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/or3.ll b/llvm/test/CodeGen/AMDGPU/or3.ll
index acf74d3..0726cd5 100644
--- a/llvm/test/CodeGen/AMDGPU/or3.ll
+++ b/llvm/test/CodeGen/AMDGPU/or3.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_OR3_B32
diff --git a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
index d1469ed..c39a887 100644
--- a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
; Testcase which happened to trigger a liveness verifier error
define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index b1ce5a3..ec15837 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck --check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=GFX7 %s
define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 5803821..e065b8e 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX803 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GFX803 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 0e1e5e4..9c38d7f 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1,9 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-GISEL %s
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_vv:
@@ -29,6 +31,17 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -61,6 +74,17 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -112,6 +136,34 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v4_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v4_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -277,6 +329,115 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v32_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[12:13]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[10:11]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[16:17]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[38:39]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[44:45]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[46:47]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[50:51]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[42:43]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[18:19]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[20:21]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[22:23]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v32_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[16:17]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[20:21]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[24:25]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[28:29]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[4:5]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[8:9]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[12:13]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -325,6 +486,32 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_imm:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_imm:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -370,6 +557,30 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_v_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_v_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -419,6 +630,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_lit_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -452,6 +688,29 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -486,6 +745,18 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_v_lit_lo0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -520,6 +791,18 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -570,6 +853,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_fneg:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_fneg:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -622,6 +930,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -674,6 +1007,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v3, -s2, -s2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -723,6 +1081,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -772,6 +1155,31 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v3, -s2, -s2
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -807,6 +1215,17 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fmul_v2_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -839,6 +1258,17 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fmul_v2_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -890,6 +1320,34 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fmul_v4_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v4_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -1055,6 +1513,115 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fmul_v32_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[12:13]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[10:11]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[16:17]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[38:39]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[44:45]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[46:47]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[50:51]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[36:37]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[42:43]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[18:19]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[20:21]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[22:23]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v32_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[16:17]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[20:21]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[24:25]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[28:29]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[4:5]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[8:9]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[12:13]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -1102,6 +1669,32 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fmul_v2_v_imm:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_v_imm:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1147,6 +1740,30 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fmul_v2_v_v_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_v_v_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1196,6 +1813,31 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fmul_v2_v_lit_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1230,6 +1872,18 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fmul_v2_v_unfoldable_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1279,6 +1933,31 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fmul_v2_v_fneg:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_v_fneg:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1314,6 +1993,17 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fma_v2_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1346,6 +2036,17 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fma_v2_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1397,6 +2098,34 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v4_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v4_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -1562,6 +2291,115 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v32_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[12:13], s[12:13]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[14:15], s[14:15]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[40:41], s[40:41]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[38:39], s[38:39]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[48:49], s[48:49]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[44:45], s[44:45]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[46:47], s[46:47]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[50:51], s[50:51]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[36:37], s[36:37]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[42:43], s[42:43]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[20:21], s[20:21]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[22:23], s[22:23]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9]
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v32_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[16:17], s[16:17]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[18:19], s[18:19]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[20:21], s[20:21]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[22:23], s[22:23]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[24:25], s[24:25]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[26:27], s[26:27]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[28:29], s[28:29]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[30:31], s[30:31]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[4:5], s[4:5]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[6:7], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[8:9], s[8:9]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[10:11], s[10:11]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[12:13], s[12:13]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[14:15], s[14:15]
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -1632,6 +2470,36 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v2_v_imm:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x43480000
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_mov_b32 s4, 0x42c80000
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3] op_sel_hi:[1,0,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v2_v_imm:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0x43480000
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1677,6 +2545,30 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v2_v_v_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v2_v_v_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[0:1], v[0:1]
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1746,6 +2638,33 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v2_v_lit_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1817,6 +2736,34 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[4:5], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000)
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1866,6 +2813,31 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_v2_v_fneg:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_v2_v_fneg:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1922,6 +2894,35 @@ define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %ou
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX1250-SDAG-NEXT: ds_load_b32 v2, v2
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX1250-GISEL-NEXT: ds_load_b32 v2, v2
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -v2, -v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
%scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
@@ -1986,6 +2987,38 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v2, s2
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
+; GFX1250-SDAG-NEXT: ds_load_b32 v4, v5
+; GFX1250-SDAG-NEXT: ds_load_b32 v5, v5 offset:8
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
+; GFX1250-GISEL-NEXT: ds_load_b32 v4, v5
+; GFX1250-GISEL-NEXT: ds_load_b32 v5, v5 offset:8
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
%arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2
@@ -2048,6 +3081,31 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: shuffle_add_f32:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
+; GFX1250-SDAG-NEXT: ds_load_b64 v[0:1], v2
+; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 offset:8
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: shuffle_add_f32:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-GISEL-NEXT: ds_load_b64 v[0:1], v2
+; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 offset:8
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
%lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
@@ -2111,6 +3169,39 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: shuffle_neg_add_f32:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
+; GFX1250-SDAG-NEXT: ds_load_b64 v[0:1], v2
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: ds_load_b32 v3, v0
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 offset:8
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: shuffle_neg_add_f32:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-GISEL-NEXT: ds_load_b64 v[0:1], v2
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: ds_load_b32 v3, v0
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 offset:8
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
%lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
@@ -2174,6 +3265,30 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0
; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_fadd_fsub_0:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v0, v1
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2248,6 +3363,38 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_fadd_fsub:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_f32 s6, s1, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], s[6:7] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[4:5]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v2, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: v_dual_subrev_f32 v3, s3, v0 :: v_dual_mov_b32 v0, 0
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5]
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> %arg, %arg1
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2300,6 +3447,32 @@ define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
; PACKED-GISEL-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_shuffle_v4:
+; GFX1250-SDAG: ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_shuffle_v4:
+; GFX1250-GISEL: ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v6, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b128 v6, v[0:3], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -2346,6 +3519,28 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fneg_v2f32_vec:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fneg_v2f32_vec:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -2387,6 +3582,26 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fneg_v2f32_scalar:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX1250-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fneg_v2f32_scalar:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
store <2 x float> %fneg, ptr addrspace(1) %a, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
index ae35d0d..581ce28 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
@@ -17,6 +17,7 @@
; CHECK-NEXT: .debug_mode: 0
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 638dc89..6b7d704 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,GFX11 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,GFX12 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,GFX12,DVGPR %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefixes=CHECK,GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefixes=CHECK,GFX12 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck --check-prefixes=CHECK,GFX12,DVGPR %s
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
@@ -19,6 +19,7 @@
; CHECK-NEXT: .debug_mode: 0
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: true
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0x200
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
index fb6ac2e..c1846c0 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
@@ -59,6 +59,7 @@
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
@@ -113,6 +114,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
+; CHECK-NEXT: .forward_progress: true
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
@@ -124,6 +126,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
+; CHECK-NEXT: .forward_progress: true
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
@@ -135,6 +138,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
+; CHECK-NEXT: .forward_progress: true
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index 15778c8..5c0c366 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -62,6 +62,7 @@
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
@@ -118,6 +119,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs_main
; CHECK-NEXT: .entry_point_symbol: gs_shader
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
@@ -130,6 +132,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs_main
; CHECK-NEXT: .entry_point_symbol: hs_shader
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
@@ -142,6 +145,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps_main
; CHECK-NEXT: .entry_point_symbol: ps_shader
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll
index 644722b..830872a 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll
@@ -62,6 +62,7 @@
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
@@ -118,6 +119,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NOT: .entry_point: _amdgpu_gs_main
; CHECK-NEXT: .entry_point_symbol: gs_shader
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
@@ -130,6 +132,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NOT: .entry_point: _amdgpu_hs_main
; CHECK-NEXT: .entry_point_symbol: hs_shader
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
@@ -142,6 +145,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NOT: .entry_point: _amdgpu_ps_main
; CHECK-NEXT: .entry_point_symbol: ps_shader
+; CHECK-NEXT: .forward_progress: true
; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index 8121816..49aa24d 100644
--- a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=redwood -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
;
; CFG flattening should use parallel-and mode to generate branch conditions and
; then merge if-regions with the same bodies.
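;
; Illustrative sketch (added for clarity, not part of the original test): the
; shape of input this exercises is two independent compares guarding identical
; if-bodies, which parallel-and mode can fold into a single guarded region:
;   %cmp0 = icmp ne i32 %a, 0
;   %cmp1 = icmp ne i32 %b, 0
;   %cond = and i1 %cmp0, %cmp1      ; parallel-and of both branch conditions
;   br i1 %cond, label %if.then, label %if.end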
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index ce96766..a5c8f04 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX908 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s
; Partial reg copy and spill missed during regalloc handled later at frame lowering.
define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index 5025c1d..8f64e3c5 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GCN %s
; FIXME: we should disable the sdwa peephole because dead-code elimination,
; which runs after the peephole, ruins this test (different register numbers)
diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
index a68b5a8..e37bfc6 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
; Test the combine that reduces the width of a 64-bit shift to 32-bit if the
; result is truncated to 16-bit.
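;
; Hedged sketch (not taken from the original test) of the pattern being
; combined: only the low 16 bits of the shift result survive the truncate,
; so the 64-bit shift can be rewritten as a 32-bit one when the shift amount
; keeps the needed bits in the low half:
;   %shift = lshr i64 %x, 4
;   %trunc = trunc i64 %shift to i16  ; combine shrinks %shift to a 32-bit lshr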
diff --git a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 70f4f96..c7b2125 100644
--- a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -verify-coalescing < %s
+; RUN: llc -mtriple=amdgcn -verify-coalescing < %s
; The original 'and' requires materializing a 64-bit immediate for
; s_and_b64. This is split into 2 x v_and_i32, part of the immediate
diff --git a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll
index 4ae0547..5d64359 100644
--- a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll
+++ b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll
@@ -1,7 +1,7 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 --stop-after=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG,SDAG-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 --stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 --stop-after=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG,SDAG-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 --stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 --stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=SDAG,SDAG-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 --stop-after=instruction-select < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 --stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=SDAG,SDAG-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 --stop-after=instruction-select < %s | FileCheck -check-prefixes=GISEL %s
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll
index cac983a..0d7e73c 100644
--- a/llvm/test/CodeGen/AMDGPU/permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsh8_or_and:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index a4ddfee..0741cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck %s -check-prefixes=GFX9
define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle6766:
diff --git a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
index 9a6cfb7..d7b1598 100644
--- a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
+++ b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_1:
; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0 op_sel:[0,1] op_sel_hi:[1,0]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index beefc91..7a290a32 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+xnack -amdgpu-max-memory-clause=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+xnack -amdgpu-max-memory-clause=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Test the behavior of the post-RA soft clause bundler in the presence
; of debug info. The debug info should not interfere with the
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 41fe0d4..efe4cfa 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 20ca575..3ce0947 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9ALL,GFX900 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9ALL,GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9ALL,GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9ALL,GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define i16 @shl_i16(i16 %x, i16 %y) {
; GFX8-LABEL: shl_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-user-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/preserve-user-waitcnt.ll
index b485093..cd6ab0b 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-user-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-user-waitcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -O3 -mtriple=amdgcn < %s | FileCheck --check-prefix=CHECK %s
; SIInsertWaitcnts should preserve waitcnt instructions coming from the user
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 2d95ec6..f4a9e7e 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; Due to high register pressure, regalloc would split the live range of the wwm VGPR register used for SGPR spills
; and introduce a copy. The copy should be a whole-wave copy, with exec mask manipulation around it.
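;
; For illustration only (an assumed shape, not copied from the test output):
; a whole-wave copy is a plain VGPR copy bracketed by exec mask manipulation
; so that all lanes are copied regardless of the current exec mask:
;   s_or_saveexec_b64 s[4:5], -1    ; force all lanes on, save the old exec
;   v_mov_b32_e32 v40, v32          ; copy the wwm spill register whole-wave
;   s_mov_b64 exec, s[4:5]          ; restore the original exec mask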
diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
index e687ad9..f2c7aba 100644
--- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPT %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPT %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPT %s
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,OPTNONE %s
; There are no stack objects, but still a private memory access. The
; private access registers need to be correctly initialized anyway, and
diff --git a/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll b/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll
index 79bcaf8..bf417b21 100644
--- a/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll
+++ b/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-dwarfdump --debug-line - | FileCheck --check-prefix=DWARFLINE %s
; Test that the prologue end line directive is emitted after all the prologue instructions
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index ed0fe0d..01cc6ab 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mattr=+promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mattr=+promote-alloca,+max-private-element-size-4 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 < %s | FileCheck -check-prefix=GCN %s
; Pointer value is stored in a candidate for LDS usage.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 554fa49..9fb7396 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
; GCN-LABEL: {{^}}float4_alloca_store4:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index bbfd5f4..b1e0515 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
declare i64 @_Z13get_global_idj(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index 98f641a..81b9222 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
; The type promotion for the vector loads v3i32/v3f32 into v4i32/v4f32 is enabled
; only when the alignment is 8-byte or higher.
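;
; Minimal example (added for illustration; names are hypothetical): a v3i32
; load that qualifies for promotion to v4i32 because it is 8-byte aligned,
; while the same load with align 4 would be left untouched:
;   %v = load <3 x i32>, ptr addrspace(1) %p, align 8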
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
index 85514e6..4ad6835 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
; GCN: foo1:
; v_cndmask_b32_e64 v0, 0, 1, vcc_lo{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index 5b9b0fe..013b68a 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
-;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
; ;CHECK: NumVgprs: 4
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 0ac3d65..e674faf 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefixes=SDAG
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefixes=SDAG
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefixes=GISEL
define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) {
; SDAG-LABEL: buffers_dont_alias:
diff --git a/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll b/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll
index e6c068f..3b6c71b 100644
--- a/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj -mtriple=r600-mesa-mesa3d -mcpu=cypress -verify-machineinstrs < %s | llvm-readobj -r --symbols - | FileCheck %s
+; RUN: llc -filetype=obj -mtriple=r600-mesa-mesa3d -mcpu=cypress < %s | llvm-readobj -r --symbols - | FileCheck %s
@arr = internal unnamed_addr addrspace(4) constant [4 x i32] [i32 4, i32 5, i32 6, i32 7], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
index 5c0192d..8723455 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
; This test just checks that the compiler doesn't crash.
diff --git a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
index 9f2cf98..5b21a36 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600-- -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
; but with all 64-bit tests, and tests with loads dropped.
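;
; For reference (a sketch following the naming in those tests, not copied
; from this file): the canonical low-bit extraction pattern being checked is
;   %onebit = shl i32 1, %numlowbits
;   %mask = add nsw i32 %onebit, -1  ; (1 << %numlowbits) - 1
;   %masked = and i32 %mask, %val    ; keeps only the low %numlowbits bits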
diff --git a/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll b/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll
index 57d0fc5..15895b7 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; TODO: Add _RTN versions and merge with the GCN test
diff --git a/llvm/test/CodeGen/AMDGPU/r600.sub.ll b/llvm/test/CodeGen/AMDGPU/r600.sub.ll
index 17b1c4a..19426c8 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
index 52b0eaf..009c8d0 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | \
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | \
; RUN: FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}tgid_x:
diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll b/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll
index c5a05e6..06c862c 100644
--- a/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll
@@ -1,6 +1,6 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=greedy -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=basic -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=fast -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=greedy -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=basic -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=fast -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
index ce46e74..54c3b46 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}rcp_uint:
; GCN: v_rcp_iflag_f32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
index a91bba4..bc26e1c 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
; CHECK: error: invalid register "flat_scratch_lo" for subtarget.
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
index f2c639f..8e78178 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn < %s 2>&1 | FileCheck %s
; CHECK: invalid type for register "exec".
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
index 02ee219..8e0de52a 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn < %s 2>&1 | FileCheck %s
; CHECK: invalid type for register "m0".
diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll
index 63ae193a..f6a5af5 100644
--- a/llvm/test/CodeGen/AMDGPU/read_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/read_register.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s
declare i32 @llvm.read_register.i32(metadata) #0
declare i64 @llvm.read_register.i64(metadata) #0
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index fd422b3..131c5f3 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -1,15 +1,15 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; -global-isel=1 SI run line skipped since store not yet implemented.
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
declare i64 @llvm.readcyclecounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
index 15f664c..ddbae64 100644
--- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
@@ -1,8 +1,8 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX700
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX1100
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
declare i64 @llvm.readsteadycounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
index 509b882..8da7c29 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: reassoc_i32:
; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index ff92db7..9a2ec9c 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -1,6 +1,6 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=V5 %s
; CHECK-LABEL: {{^}}recursive:
; CHECK: .set recursive.private_seg_size, 16+max(16384)
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll b/llvm/test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll
index d835f69..4230fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; Make sure reduceBuildVecExtToExtBuildVec combine doesn't regress
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index 14e0203..47f0c4c 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index 80a2aebc..d73ab2b 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
; GCN: s_load_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll
index 7f9044a..291eccd 100644
--- a/llvm/test/CodeGen/AMDGPU/reduction.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduction.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
define half @reduction_fadd_v4f16(<4 x half> %vec4) {
; GFX9-LABEL: reduction_fadd_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
index c9d0cf3..fef7332 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
@@ -45,13 +45,13 @@ body: |
INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0
%14:vgpr_32 = COPY killed $agpr0
- INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %7, 10158090 /* regdef:VReg_256 */, def %8, 4784138 /* regdef:VReg_128 */, def %9, 3670026 /* regdef:VReg_96 */, def %10, 3670026 /* regdef:VReg_96 */, def %11
+ INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27262986 /* regdef:VReg_512 */, def %7, 13565962 /* regdef:VReg_256 */, def %8, 6094858 /* regdef:VReg_128 */, def %9, 4784138 /* regdef:VReg_96 */, def %10, 4784138 /* regdef:VReg_96 */, def %11
INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 11534345 /* reguse:VReg_512 */, %7
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10158089 /* reguse:VReg_256 */, %8
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128 */, %9
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %10
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %11
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27262985 /* reguse:VReg_512 */, %7
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13565961 /* reguse:VReg_256 */, %8
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %9
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %10
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %11
$agpr1 = COPY %14
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1
SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index 45ca0d4..f2fd3a8 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -o - %s 2>%t.err | FileCheck -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -o - %s 2>%t.err | FileCheck -implicit-check-not=error %s
; RUN: FileCheck -check-prefix=ERR %s < %t.err
; This testcase would fail on an "illegal eviction". If the assert was
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index 35e11ad..4571f32 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -asm-verbose < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-unknown-amdhsa -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll b/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll
index 5e466a9..f60fca1 100644
--- a/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR -implicit-check-not=error %s
; ERR: error: inline assembly requires more registers than available
; ERR-NOT: ERROR
diff --git a/llvm/test/CodeGen/AMDGPU/reject-agpr-usage-before-gfx908.ll b/llvm/test/CodeGen/AMDGPU/reject-agpr-usage-before-gfx908.ll
index ba1c3b4..6737fdc 100644
--- a/llvm/test/CodeGen/AMDGPU/reject-agpr-usage-before-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/reject-agpr-usage-before-gfx908.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN %s
-; RUN: not llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 < %s 2>&1 | FileCheck -check-prefixes=GCN %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx906 < %s 2>&1 | FileCheck -check-prefixes=GCN %s
; GCN: couldn't allocate input reg for constraint 'a'
diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll
index 59d64f3..e57c2f6 100644
--- a/llvm/test/CodeGen/AMDGPU/rel32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rel32.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
@g = protected local_unnamed_addr addrspace(4) externally_initialized global i32 0, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 5d0e4bf..8fe68ba 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1084,10 +1076,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -1900,28 +1892,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -1932,7 +1917,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2471,10 +2455,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
index 8383930..b3fbf16 100644
--- a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -enable-misched=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -enable-misched=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
; Scheduler disabled to work around issue #129028
diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
index dc5e442..c552f9d 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_remat_sgpr:
; GCN-NOT: v_writelane_b32
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll
index f57e86c..c899e35 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefix=EXTIMG %s
; RUN: FileCheck -allow-empty --check-prefix=WARN-EXTIMG %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefix=NOEXTIMG %s
; RUN: FileCheck --check-prefix=WARN-NOEXTIMG %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s
; Note: This test checks the IR, but also has a run line to codegen the file just to check we
; do not crash when trying to select those functions.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll
index 0359bb7..a4edcac 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX7,IR %s
; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=bonaire -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX7,IR %s
@@ -11,7 +11,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s
; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s
@@ -20,22 +20,22 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX9,GFX906,IR %s
; RUN: FileCheck --check-prefix=WARN-GFX906 %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX9,GFX90A,IR %s
; RUN: FileCheck --check-prefix=WARN-GFX90A %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10,IR %s
; RUN: FileCheck --check-prefix=WARN-GFX10 %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11,IR %s
; RUN: FileCheck --check-prefix=WARN-GFX11 %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s
; Note: This test checks the IR, but also has a run line to codegen the file just to check we
; do not crash when trying to select those functions.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll
index 2b1e399..87304e9 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,IR %s
; RUN: FileCheck -allow-empty --check-prefix=WARN-COMPATIBLE %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,IR %s
; RUN: FileCheck --check-prefixes=WARN-INCOMPATIBLE %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s
; Note: This test checks the IR, but also has a run line to codegen the file just to check we
; do not crash when trying to select those functions.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
index efb8d83..d182d35 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s
; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s
@@ -11,7 +11,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s
; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 < %s
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
index 038f49f3..3ea649f 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
@@ -1,23 +1,23 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX906 %s
; RUN: FileCheck --check-prefix=WARN-GFX906 %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=+wavefrontsize64 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=+wavefrontsize64 < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX90A %s
; RUN: FileCheck --check-prefix=WARN-GFX90A %s < %t
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+wavefrontsize64 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+wavefrontsize64 < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 < %s
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 < %s
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
diff --git a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
index 5f6e207..9e20cf3 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o /dev/null %s
+; RUN: llc -o /dev/null %s
; Check that renameDisconnectedComponents() does not create vregs without a
; definition on every path (there should at least be IMPLICIT_DEF instructions).
target triple = "amdgcn--"
diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-pal.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-pal.ll
index 05f1d59..2fbf2e2a 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-pal.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck %s
; Check that we do not assume any default stack size for PAL code object
; indirect calls. The driver knows the max recursion depth, so it can compute
diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll
index 6746381..43f5c22 100644
--- a/llvm/test/CodeGen/AMDGPU/ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret.ll
@@ -1,7 +1,7 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}vgpr:
; GCN-DAG: v_mov_b32_e32 v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
index 4e9fb1a..d0bdf0dc 100644
--- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
; This should end with an no-op sequence of exec mask manipulations
; Mask should be in original state after executed unreachable block
diff --git a/llvm/test/CodeGen/AMDGPU/returnaddress.ll b/llvm/test/CodeGen/AMDGPU/returnaddress.ll
index 09243a5..babcd0d 100644
--- a/llvm/test/CodeGen/AMDGPU/returnaddress.ll
+++ b/llvm/test/CodeGen/AMDGPU/returnaddress.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
; Test with zero frame
; GCN-LABEL: {{^}}func1
diff --git a/llvm/test/CodeGen/AMDGPU/rotate-add.ll b/llvm/test/CodeGen/AMDGPU/rotate-add.ll
index 53a49c9a..a295b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/rotate-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotate-add.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
index 5a2a368..5839fd2 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotl_i64:
; BOTH-DAG: s_lshl_b64
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 0a746b0..2502067 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; R600-LABEL: rotl_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
index 4c7c801..76b57c6 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotr_i64:
; BOTH-DAG: s_sub_i32
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d..74ac181 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; R600-LABEL: rotr_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index b1cea0e..dba10f1 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 < %s | FileCheck %s -check-prefix=GFX12
define void @test_remat_s_getpc_b64() {
; GFX9-LABEL: test_remat_s_getpc_b64:
diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index 3140511..f14a5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack < %s | FileCheck -check-prefix=SI %s
; TODO: Some of those tests fail with OS == amdhsa due to unreasonable register
; allocation differences.
diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
index d8dd47c..8176e77 100644
--- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}s_movk_i32_k0:
; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
index d54edbc..ab98e81 100644
--- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack < %s | FileCheck -check-prefix=GFX8 %s
define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) {
; GFX6-LABEL: s_mulk_i32_k0:
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 8f25e65..0b58b32 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_pat1:
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 4177179..8861b772 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index 3a57361..ef7e8a5 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI -check-prefix=CI-NOHSA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI -check-prefix=CI-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/save-fp.ll b/llvm/test/CodeGen/AMDGPU/save-fp.ll
index 4d18a0d..cd0fc54 100644
--- a/llvm/test/CodeGen/AMDGPU/save-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/save-fp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX900 %s
define void @foo() {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll b/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
index 9c1060ee..34d672c 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx802 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; This checks for a bug where uniform control flow can result in multiple
; v_cmp results being combined together with s_and_b64, s_or_b64 and s_xor_b64,
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll
index 4865290..689e918 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel < %s | FileCheck %s
define amdgpu_vs float @sitofp_i32_to_f32(i32 inreg %val) {
; CHECK-LABEL: sitofp_i32_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
index debbfce..b2770f3 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=CHECK,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel < %s | FileCheck -check-prefixes=CHECK,GFX12 %s
define amdgpu_vs float @fadd_f32(float inreg %a, float inreg %b) {
; CHECK-LABEL: fadd_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
index 3d283d6..6aa33ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel < %s | FileCheck -check-prefix=GISEL %s
define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
; SDAG-LABEL: f32_olt:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 3fbfd75..52ef811 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -| FileCheck %s --check-prefixes=GFX89,GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX89,VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -| FileCheck %s --check-prefixes=GFX89,GFX9
; XXX - Why the packing?
define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 29448ab..e8e122e 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX906 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX908 %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX906 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX900-LABEL: scalar_to_vector_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
index b37a66d..808e60f 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}scalar_to_vector_i16:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 42
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
new file mode 100644
index 0000000..735720a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -0,0 +1,426 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b32_idx32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr %p, i32 %idx
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) {
+; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
+; SDAG-NEXT: flat_load_b32 v0, v[0:1]
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1]
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd
+ %ld = load i16, ptr %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
+ %ret = load <2 x float>, ptr %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b96_idxpromi_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom
+ %ret = load <4 x float>, ptr %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxadd
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, so there is nothing to scale
+
+define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr %p, i64 %idxadd
+ %ld = load i8, ptr %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom
+ %ld = load i16, ptr %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd
+ %ld = load i16, ptr %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
+ %ret = load <2 x float>, ptr %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b96_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom
+ %ret = load <4 x float>, ptr %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_store_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1.0
+; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
+ store float 1.0, ptr %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_store_b16_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom
+ store i16 1, ptr %arrayidx, align 2
+ ret void
+}
+
+define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_store_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
+; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds double, ptr %p, i64 %idxprom
+ store double 1.0, ptr %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_atomicrmw_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idxprom
+ atomicrmw add ptr %arrayidx, i32 1 monotonic
+ ret void
+}
+
+define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) {
+; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; SDAG-NEXT: s_mov_b32 s0, exec_lo
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
+; SDAG-NEXT: s_cbranch_execnz .LBB21_3
+; SDAG-NEXT: ; %bb.1: ; %Flow
+; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
+; SDAG-NEXT: s_cbranch_execnz .LBB21_4
+; SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
+; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: s_branch .LBB21_5
+; SDAG-NEXT: .LBB21_3: ; %atomicrmw.global
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1
+; SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
+; SDAG-NEXT: s_cbranch_execz .LBB21_2
+; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
+; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; SDAG-NEXT: s_branch .LBB21_5
+; SDAG-NEXT: .LBB21_5:
+;
+; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
+; GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0
+; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5
+; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
+; GISEL-NEXT: s_cbranch_execnz .LBB21_3
+; GISEL-NEXT: ; %bb.1: ; %Flow
+; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
+; GISEL-NEXT: s_cbranch_execnz .LBB21_4
+; GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_branch .LBB21_5
+; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
+; GISEL-NEXT: s_cbranch_execz .LBB21_2
+; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
+; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GISEL-NEXT: s_branch .LBB21_5
+; GISEL-NEXT: .LBB21_5:
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i64, ptr %p, i64 %idxprom
+ %ret = atomicrmw add ptr %arrayidx, i64 1 monotonic
+ %ret.cast = bitcast i64 %ret to <2 x float>
+ ret <2 x float> %ret.cast
+}
+
+!0 = !{i32 0, i32 1024}
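The ioffset tests above fix how the address is split between the scaled index and the immediate. Taking flat_load_b16_idxprom_ioffset as a worked example (the arithmetic below is inferred from the CHECK lines, not stated by the patch): with 2-byte elements and %idxadd = %idxprom + 16, the byte address is

    p + 2*(idx + 16) = p + (idx << 1) + 32

so the constant part of the index folds into the immediate (offset:32) while the remaining idx << 1 is carried by scale_offset, which scales the 32-bit index by the access size. The offset:192 in the [3 x float] tests is the same split with a 12-byte stride: 12*(idx + 16) = 12*idx + 192.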
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
new file mode 100644
index 0000000..faea84e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b32_idx32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; SDAG-LABEL: global_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
+; SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: global_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1]
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(1) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b96_idxpromi_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, so there is nothing to scale
+
+define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(1) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(1) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(1) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b96_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_store_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1.0
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
+ store float 1.0, ptr addrspace(1) %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) {
+; GCN-LABEL: global_store_b16_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
+ store i16 1, ptr addrspace(1) %arrayidx, align 2
+ ret void
+}
+
+define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_store_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
+; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom
+ store double 1.0, ptr addrspace(1) %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_atomicrmw_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom
+ atomicrmw add ptr addrspace(1) %arrayidx, i32 1 monotonic
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) {
+; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1
+; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom
+ %ret = atomicrmw add ptr addrspace(1) %arrayidx, i64 1 monotonic
+ %ret.cast = bitcast i64 %ret to <2 x float>
+ ret <2 x float> %ret.cast
+}
+
+!0 = !{i32 0, i32 1024}
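The *_wrong_stride tests above make the matching condition visible: scale_offset can only encode a shift by the access size, so it is formed only when the GEP stride equals the width of the load or store. In global_load_b32_idxprom_wrong_stride the element type is <2 x float> (8-byte stride) while the access is 4 bytes wide, i.e.

    addr = p + 8*idx, access width = 4 bytes, 8 != 4

so the scaled form is rejected and the shift is materialized explicitly instead (the v_lshl_add_u64 ..., 3, s[0:1] in the SDAG checks). This reading is inferred from the CHECK lines; the b64, b96, and b128 tests, where stride and access width agree, all select scale_offset.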
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
new file mode 100644
index 0000000..27ecc83
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) {
+; GCN-LABEL: scratch_load_b32_alloca_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %p = alloca [32 x i32], align 4, addrspace(5)
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(5) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(5) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b32_idx32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i32 %idx
+ %ret = load float, ptr addrspace(5) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT: scratch_load_b32 v0, v0, s0
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(5) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(5) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b96_idxpromi_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd
+ %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(5) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(5) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_u8 v0, v0, s0 offset:16
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(5) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(5) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_u16 v0, v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(5) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(5) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+; Multiplication is unsigned here, so we cannot match it.
+
+define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd
+ %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
+; GCN-LABEL: scratch_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: scratch_load_b32 v0, v0, off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_store_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1.0
+; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
+ store float 1.0, ptr addrspace(5) %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_store_b16_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom
+ store i16 1, ptr addrspace(5) %arrayidx, align 2
+ ret void
+}
+
+define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: scratch_store_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
+; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds double, ptr addrspace(5) %p, i64 %idxprom
+ store double 1.0, ptr addrspace(5) %arrayidx, align 4
+ ret void
+}
+
+!0 = !{i32 0, i32 1024}
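A gloss on the sext-based range tests (inferred from the checks and the in-file comment about unsigned multiplication, not stated by the patch): the scratch index scaling is an unsigned multiply, so a sign-extended index can only be scaled when it is provably non-negative. The !range metadata supplies that proof. With !0 = !{i32 0, i32 1024} the loaded index lies in [0, 1024), its sign bit is zero, and sext and zext to i64 agree:

    %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
    %idxprom = sext i32 %idx to i64    ; equal to zext for idx in [0, 1024)

which is why scratch_load_b96_idxprom_range and its ioffset variant still select scale_offset despite the sext.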
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
new file mode 100644
index 0000000..b5bb68e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; 'i32 %idx' is a signed index while SMRD soffset is unsigned, thus it is not selected.
+
+define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idx32:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_ashr_i32 s3, s2, 31
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idx32:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s3, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s3, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b256_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b512_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <16 x float> %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, so there is nothing to scale (see the address breakdown after this function).
+
+define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(4) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
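
; Address breakdown, added editorially (assumed semantics of the scaled form):
;   EA = base + (soffset << log2(element size)) + imm_offset
; In the i16 test above, the +16-element offset folds into the immediate:
;   EA = %p + (%idx << 1) + 0x20        (16 x 2 bytes = 0x20)
; Here the element is i8, the shift amount is 0, and nothing is scaled, so the
; compiler emits a plain soffset with imm_offset 0x10 (16 x 1 byte) instead.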
+
+define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b256_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b512_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <16 x float> %ret
+}
+
+!0 = !{i32 0, i32 1024}
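
; Editorial note: !range intervals are half-open, so !0 bounds every index
; loaded through it to [0, 1024); that guarantee is what the *_range tests
; above rely on when the index itself comes from memory.
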
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 90dfd5a..15f5f89 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck %s
; This was a negative test to catch an extreme case when all options are exhausted
; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs
diff --git a/llvm/test/CodeGen/AMDGPU/sched-setprio.ll b/llvm/test/CodeGen/AMDGPU/sched-setprio.ll
index a5e4b58..78a1471 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-setprio.ll
+++ b/llvm/test/CodeGen/AMDGPU/sched-setprio.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefix=GCN %s
declare void @llvm.amdgcn.s.setprio(i16)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-avoid-spills.ll b/llvm/test/CodeGen/AMDGPU/schedule-avoid-spills.ll
index 4096d32..98cc6ba 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-avoid-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-avoid-spills.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}load_fma_store
; GCN-NOT: scratch_store
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
index 63d75f3..0517be5 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; RUN: llc -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched < %s
; REQUIRES: asserts
define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index 48caabd..9145ca4 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; RUN: llc -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched < %s
; REQUIRES: asserts
define amdgpu_kernel void @main() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll
index 8380bee0..b916151 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; RUN: llc -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched < %s
; REQUIRES: asserts
define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
index c985737..e55cc7f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FIXME: This currently doesn't do a great job of clustering the
; loads, which end up with extra moves between them. Right now, it
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
index d6dc911..2baa955 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
;REQUIRES: asserts
define amdgpu_kernel void @main() {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-if.ll
index 0d3891d..fedea6e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-if.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;RUN: llc < %s -mtriple=r600 -mcpu=cayman -stress-sched -verify-misched
;REQUIRES: asserts
define amdgpu_kernel void @main() {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
index 350ff94..317a70b 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=max-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-max-ilp < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=max-ilp < %s | FileCheck %s
; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index 563eb45..e798dff 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
; FUNC-LABEL: {{^}}cluster_arg_loads:
; SI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
index b3eb305..6fb485c 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -passes=always-inline -o %t.bc %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy < %t.bc | FileCheck %s --check-prefixes=CHECK
; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
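
(The arithmetic behind the comment above, spelled out editorially: 32 overlapped
vec4 values at 4 32-bit VGPRs each is 32 x 4 = 128 VGPRs live at once, hence
the "(at least) 128 VGPRs" requirement.)
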
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
index bd1258c..ff3a1ea 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; Interleave loads and stores to fit into the 9-VGPR limit.
; This requires avoiding load/store clustering.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
index 3ba8038..6d53524 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -1,6 +1,6 @@
-; RUN: llc -enable-amdgpu-aa=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-minreg < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s
; We expect two-digit VGPR usage here, not three-digit.
; CHECK: NumVgprs: {{[0-9][0-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 462ac23..22ea449 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -1,11 +1,11 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg < %s | FileCheck --check-prefix=SI-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck --check-prefix=SI-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg < %s | FileCheck --check-prefix=SI-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc < %s | FileCheck --check-prefix=SI-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg < %s | FileCheck --check-prefix=VI-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck --check-prefix=VI-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg < %s | FileCheck --check-prefix=VI-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc < %s | FileCheck --check-prefix=VI-MAXOCC %s
; SI-MINREG: NumSgprs: {{[1-9]$}}
; SI-MINREG: NumVgprs: {{[1-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
index ef24996..46044aa8 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=MISCHED %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp < %s | FileCheck --check-prefix=GCN-ILP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=iterative-ilp < %s | FileCheck --check-prefix=GCN-ILP %s
; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
index 26f9ba4..27dc408 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -debug-only=machine-scheduler -o /dev/null < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -debug-only=machine-scheduler -o /dev/null < %s 2>&1 | FileCheck %s
; We are only targeting one wave. Check that the machine scheduler doesn't use
; register pressure heuristics to prioritize any candidate instruction.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
index 5a30d5d..7b8eba1 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=OCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s
; Using -amdgpu-schedule-relaxed-occupancy allows the scheduler to produce better ILP by further relaxing the occupancy target
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index c5e04b3..92d31e4 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.amdgcn.s.barrier() nounwind convergent
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
index a703ce0..57f08de 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=machine-scheduler -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=machine-scheduler < %s 2>&1 | FileCheck -enable-var-scope %s
; REQUIRES: asserts
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
index 4ada730..d38294b 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; When a frame index offset is more than 12 bits, make sure we don't store
; it in mubuf's offset field.
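
(Editorial aside on the limit the comment refers to: the MUBUF immediate offset
field is 12 bits unsigned, so it can encode at most 2^12 - 1 = 4095 bytes; any
frame-index offset of 4096 or more must instead travel through a register.)
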
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
index cdaac14..902e3ef 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GISEL
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GISEL
define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %flag) {
; GCN-LABEL: sink_scratch_pointer:
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 0a67b2e..7a3bff8 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR-PAL %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR-PAL %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR-PAL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1
diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index e114f1c..fe27a99 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -verify-machineinstrs < %s -debug-only=isel 2>&1 | FileCheck --check-prefixes=GCN,GCN-DEFAULT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -verify-machineinstrs < %s -debug-only=isel -dag-dump-verbose 2>&1 | FileCheck --check-prefixes=GCN,GCN-VERBOSE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 < %s -debug-only=isel 2>&1 | FileCheck --check-prefixes=GCN,GCN-DEFAULT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 < %s -debug-only=isel -dag-dump-verbose 2>&1 | FileCheck --check-prefixes=GCN,GCN-VERBOSE %s
; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index df49625..4addf42 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_sdiv:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll b/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
index 38a96ee..6873ff0 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=FIJI,GCN %s
; GCN-LABEL: {{^}}test_add_co_sdwa:
; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 38e4504..19f0e93 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOSDWA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX89 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX10 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOSDWA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX10 %s
define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: add_shr_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
index 5eb3ae8..9896e5f 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -o - %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
index decee14..338c4eb 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; --------------------------------------------------------------------------------
; Don't fold if fneg can fold into the source
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index ec0455a..c402b69 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-no-signed-zeros-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-no-signed-zeros-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}add_select_fabs_fabs_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll
index 06a2d86..8185c9b 100644
--- a/llvm/test/CodeGen/AMDGPU/select-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: This should go in the existing select.ll test, except the current testcase there is broken on GCN
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 9ef384f..87fdbab 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; Make sure to test with f32 and i32 compares. If we have to use float
; compares, we always have multiple condition registers. If we can do
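
A minimal illustration of the mix the comment above describes, added editorially
(hypothetical kernel, not taken from the test): one float compare and one integer
compare feeding the same select, so the backend has to juggle more than one
condition register.

define amdgpu_kernel void @sketch_cmp_mix(ptr addrspace(1) %out, float %fa, float %fb, i32 %ia, i32 %ib) {
entry:
  %fc = fcmp olt float %fa, %fb      ; float compare: produces a VCC-style condition mask
  %ic = icmp slt i32 %ia, %ib        ; integer compare: can often live in SCC
  %cc = and i1 %fc, %ic              ; combining them forces multiple condition registers
  %r  = select i1 %cc, i32 %ia, i32 %ib
  store i32 %r, ptr addrspace(1) %out
  ret void
}
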
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index c8c40d4..bee00f6 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 3f921ad..bbdfc76 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index 0ef41fb..de154b5 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}select0:
; i64 select should be split into two i32 selects, and we shouldn't need
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
index 2de0a20..a16ad927 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}selectcc_i64:
; EG: XOR_INT
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index 6f841c8..5c90957 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GCN
define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_then:
diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
index 5f101c3..8e6fec0 100644
--- a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
+++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-isel < %s 2>&1 | FileCheck %s
define amdgpu_ps void @_amdgpu_ps_main() {
; CHECK-LABEL: name: _amdgpu_ps_main
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index 1883179..be85016 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; Test fcmp pred (fneg x), c -> fcmp (swapped pred) x, -c combine.
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
index fffbda9..be3d5d1 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte:
; GCN: s_load_dword s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index 28c7693..031a55a 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
; GCN-NOT: v_cmp
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-sext.ll b/llvm/test/CodeGen/AMDGPU/setcc-sext.ll
index 4432ac4..83c3957 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-sext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}setcc_sgt_true_sext:
; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
index cc82f53..d25ca0e 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=R600 -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index 438d8d2..b36ed3e 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,VI %s
; XXX: Merge this into setcc, once R600 supports 64-bit operations
diff --git a/llvm/test/CodeGen/AMDGPU/seto.ll b/llvm/test/CodeGen/AMDGPU/seto.ll
index 9425857..9e20efc 100644
--- a/llvm/test/CodeGen/AMDGPU/seto.ll
+++ b/llvm/test/CodeGen/AMDGPU/seto.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s
; CHECK-LABEL: {{^}}main:
; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
diff --git a/llvm/test/CodeGen/AMDGPU/setuo.ll b/llvm/test/CodeGen/AMDGPU/setuo.ll
index 379bae4..dfecfce 100644
--- a/llvm/test/CodeGen/AMDGPU/setuo.ll
+++ b/llvm/test/CodeGen/AMDGPU/setuo.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}main:
; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
index 8f94426..a0bac53 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) {
; GCN-LABEL: sext_i16_to_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
index e07c309..fd90e92 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add:
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
index 660764d..96956486 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
;
; EG-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
; EG: MEM_{{.*}} MSKOR [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index cc07ee4e..65fa2ca 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX89,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX9,GFX89,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX89,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX9,GFX89,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=cypress < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s
; FIXME: i16 promotion pass ruins the scalar cases when legal.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 220e870..40b6f02 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=SI %s
;
; Most SALU instructions ignore control flow, so we need to make sure
; they don't overwrite values from other blocks.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index 8497448..63fd450 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; Copy VGPR -> SGPR used twice as an instruction operand, which is then
; used in an REG_SEQUENCE that also needs to be handled.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
index 0902dae..c3a1911 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -o - %s | FileCheck %s
; CHECK-LABEL: {{^}}t0:
; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[8:9], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index 5a30386..c82b341 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
index 5692dc1..0aa44df 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck %s
; This tests for a bug that caused a crash in SIRegisterInfo::spillSGPR()
; which was due to incorrect book-keeping of removed dead frame indices.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 6d69b4c..fcf2aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=GCN %s
; The first 64 SGPR spills can go to a VGPR, but there isn't a second
; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
index c461020..076fff7 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck -check-prefix=GCN %s
; Make sure there's no verifier error from improperly updated
; SlotIndexes if regalloc fast is manually used.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 5824c7b..b52821e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -O0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
define void @child_function() #0 {
; GCN-LABEL: child_function:
diff --git a/llvm/test/CodeGen/AMDGPU/sgprcopies.ll b/llvm/test/CodeGen/AMDGPU/sgprcopies.ll
index 5a66bff..c2ea526 100644
--- a/llvm/test/CodeGen/AMDGPU/sgprcopies.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgprcopies.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}checkTwoBlocksWithUniformBranch
; GCN: BB0_2
diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index 363d568..8f3acec 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,CI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,CI %s
; Check that an addrspace(1) (const) load with various combinations of
; uniform, nonuniform and constant address components all load with an
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 37cf761..2b698d3 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; Extract the high bit of the 1st quarter
define amdgpu_kernel void @v_uextract_bit_31_i128(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index 87083d6..6be41fb 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: Fails with -enable-var-scope
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 5734c81..3a2d056 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_shl_i128_vv:
diff --git a/llvm/test/CodeGen/AMDGPU/shift-select.ll b/llvm/test/CodeGen/AMDGPU/shift-select.ll
index 72069e1..8e0cdeb 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-select.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=instruction-select < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -stop-after=instruction-select < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=instruction-select < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -stop-after=instruction-select < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s
; GCN-LABEL: name: s_shl_i32
; GCN: S_LSHL_B32
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index a82a6a8..7aa7342 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=verde | FileCheck %s --check-prefixes=SI
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -check-prefixes=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=redwood | FileCheck %s --check-prefixes=EG
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 1c5c16d..d8511c8 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope --check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add.ll b/llvm/test/CodeGen/AMDGPU/shl_add.ll
index bcbf3f6..7af6c8b 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_LSHL_ADD_U32
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
index 945b92a..806bd994 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 47cc014..c0a050c 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; Test that doing a shift of a pointer with a constant add will be
; folded into the constant offset addressing mode even if the add has
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
index 6541342..d0377b4 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
; GCN-LABEL: {{^}}shl_base_atomicrmw_global_atomic_csub_ptr:
; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index 8ea83da..e83ed89 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
; GCN-LABEL: shl_base_atomicrmw_global_ptr:
diff --git a/llvm/test/CodeGen/AMDGPU/shl_or.ll b/llvm/test/CodeGen/AMDGPU/shl_or.ll
index 86d97ff..efb28c8 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_or.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_or.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_LSHL_OR_B32
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 3519bef..98c4868 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s -check-prefix=GCN
define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 {
; GCN-LABEL: should_not_hoist_set_inactive:
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
index ab28054..8efa58d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=FLAT %s
define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
; SI-LABEL: uniform_kill:
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index 522b465..4a863cf 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -1,6 +1,6 @@
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -S -passes=structurizecfg,si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
; OPT-LABEL: @annotate_unreachable_noloop(
; OPT-NOT: call i1 @llvm.amdgcn.loop
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
index 58e3ee1..707c308 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
@@ -1,6 +1,6 @@
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -S -passes=structurizecfg,si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; OPT-LABEL: @annotate_unreachable(
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 745d6b3..e8da10c 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=FLAT %s
define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, i32 %a) {
; SI-LABEL: break_inserted_outside_of_loop:
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
index e5047cf..5d5e35f 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=kaveri < %s | FileCheck %s
define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
; CHECK-LABEL: test:
diff --git a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 2d96011..dfd8166 100644
--- a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -1,4 +1,4 @@
-; RUN: llc -o - %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs -stop-after finalize-isel | FileCheck %s
+; RUN: llc -o - %s -mtriple=amdgcn -mcpu=verde -stop-after finalize-isel | FileCheck %s
; This test verifies that the instruction selection will add the implicit
; register operands in the correct order when modifying the opcode of an
; instruction to V_ADD_CO_U32_e32.
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll
index 917743b..44dcbc5 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}if_with_kill:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]],
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 71bbf86..90a76c3 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
; GCN: v_cmp_eq_u32
diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index b662254..d564e74 100644
--- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll
index 88daad2..931f00e 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; If this occurs it is likely due to reordering and the restore was
; originally supposed to happen before SI_END_CF.
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 61da875..fb336f4 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
%struct.lds = type { [64 x ptr], [16 x i8] }
@stored_lds_struct = addrspace(3) global %struct.lds poison, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 2dfb72a..4cbe682 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
-; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s
declare void @llvm.trap()
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
index f232275..d20fef3 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck -check-prefix=GCN %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -verify -S %s -o - | FileCheck -check-prefix=IR %s
; A test with a divergent unreachable block and uniform return block. The
diff --git a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
index ee843dc..09f841f 100644
--- a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s
; CHECK: {{^}}test_8_min_char:
; CHECK: buffer_store_byte
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 3523423..308d87b 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; FIXME: Why is this commuted only sometimes?
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index ec03043..cb8bbde 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
-; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI
+; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
+; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI
define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 6ffc8ca..fa482d9 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -58,7 +58,8 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
; OW-NEXT: entry:
; OW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; OW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; OW-NEXT: call void [[FP]]()
+; OW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+; OW-NEXT: call void [[LOAD]]()
; OW-NEXT: ret void
;
; CW-LABEL: define {{[^@]+}}@foo
@@ -66,7 +67,8 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
; CW-NEXT: entry:
; CW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; CW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; CW-NEXT: [[TMP0:%.*]] = icmp eq ptr [[FP]], @bar1
+; CW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+; CW-NEXT: [[TMP0:%.*]] = icmp eq ptr [[LOAD]], @bar1
; CW-NEXT: br i1 [[TMP0]], label [[TMP1:%.*]], label [[TMP2:%.*]]
; CW: 1:
; CW-NEXT: call void @bar1()
@@ -86,7 +88,8 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
; NO-NEXT: entry:
; NO-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; NO-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; NO-NEXT: call void [[FP]](), !callees [[META0:![0-9]+]]
+; NO-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+; NO-NEXT: call void [[LOAD]](), !callees [[META0:![0-9]+]]
; NO-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 8f94b63f..65de7f8 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -19,9 +19,9 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
-; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
-; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; ATTRIBUTOR_GCN-NEXT: call void @indirect()
+; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8, !noalias.addrspace [[META0:![0-9]+]]
+; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8, !noalias.addrspace [[META0]]
+; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
; ATTRIBUTOR_GCN-NEXT: ret void
;
; GFX9-LABEL: test_simple_indirect_call:
@@ -58,7 +58,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
-;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+;.
+; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
index eb8c3ca..fa4d699 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN %s
; Test that image.sample LOD(_L), Level 0(_LZ), Derivative(_D) instructions are sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image.
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index d71d0f7..6a45b96 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index ebe6b23..d462786 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
index d4b0dfd..6d4f1b2 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 0b68a05..09596e9 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @sitofp_i16_to_f16(
; SI-LABEL: sitofp_i16_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
index 6f76864..e3b8379 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: merge with trap.ll
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 6fc92bc..b21c781 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
; GCN-LABEL: test_kill_depth_0_imm_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index ddf6297..a9fb779 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
index d8c015b..1e042d3 100644
--- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll
index 3c49375..dbcb4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}s_abs_i32:
; GCN: s_abs_i32
diff --git a/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll b/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll
index d122e4d..f68fe736 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11 %s
; GCN-LABEL: {{^}}smrd_imm_dlc:
; GFX10: s_buffer_load_dword s0, s[0:3], 0x0 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0xfa]
diff --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index 5a0ff52..616d928 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}vccz_workaround:
; GCN: s_load_dword [[REG:s[0-9]+]], s[{{[0-9]+:[0-9]+}}],
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index ceb1ce4..0c3b798 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=SI,GCN,SICIVI,SICI,SIVIGFX9_10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=CI,GCN,SICIVI,SICI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=VI,GCN,SICIVI,VIGFX9_10,SIVIGFX9_10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=GFX10,GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck --check-prefixes=SI,GCN,SICIVI,SICI,SIVIGFX9_10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck --check-prefixes=CI,GCN,SICIVI,SICI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck --check-prefixes=VI,GCN,SICIVI,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck --check-prefixes=GFX10,GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s
; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
index 6312816..114d4c3 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GCN
; GCN-LABEL: ; %bb.0:
; GCN: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
index 84aab52..23a0d1dd 100644
--- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; Since this intrinsic is exposed as a constant after isel, use it to
; defeat the DAG's compare with constant canonicalizations.
diff --git a/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll b/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll
index 1aec329..a0ef300 100644
--- a/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll
+++ b/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -debug-only=branch-relaxation -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -debug-only=branch-relaxation < %s 2>&1 | FileCheck --check-prefix=GFX10 %s
; GFX10: Basic blocks after relaxation
; GFX10: %bb.0 offset=00000000 size=0x1c
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 6afef91..5484f77 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
; GCN-LABEL: {{^}}max_12regs_13a_used:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index cc42077..c08118f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=TONGA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
; On Tonga and Iceland, limited SGPR availability means care must be taken to
; allocate scratch registers correctly. Check that this test compiles without
diff --git a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
index c3b6d8d..17b2b68 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -stress-regalloc=6 < %s | FileCheck %s
; Inline spiller can decide to move a spill as early as possible in the basic block.
; It will skip phis and label, but we also need to make sure it skips instructions
diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
index 03988c3..83bf3a7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}spill_csr_s5_copy:
; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 7225402..3e4dbbd 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=TOVMEM -check-prefix=GCN %s
; XXX - Why does it like to use vcc?
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 648b59f..cbc3efc 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=greedy,1 -o - %s | FileCheck -check-prefix=GCN %s
; Convert AV spills into VGPR spills by introducing appropriate copies in between.
define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 4384d1e..04f73a3 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -o - %s | FileCheck %s
; Regression test for `processFunctionBeforeFrameFinalized`:
; Check that it correctly updates RegisterScavenger so we
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index e7b61b8..f485b3f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX908 %s
; GFX908-LABEL: {{^}}max_11_vgprs_used_9a:
; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
index 3c5b333..2b20f9d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; GCN-LABEL: {{^}}max_11_vgprs:
; GFX900-NOT: SCRATCH_RSRC
diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index dd6e9b9..e8e8385 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VGPR %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VMEM %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN -check-prefix=VGPR %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VMEM %s
; GCN-LABEL: {{^}}spill_sgpr_x2:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
index 241bab3..7ec4620 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -o - %s | FileCheck -check-prefix=GCN %s
; Callee must preserve the VGPR modified by writelane even if it is marked Caller-saved.
diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index fba8545..e962d1ba 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -enable-var-scope %s
define void @spill_more_than_wavesize_csr_sgprs() {
; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs:
diff --git a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index 00c2a9d..dbecdb2 100644
--- a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() readnone
diff --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll
index dbb621d..6d17944 100644
--- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
; Make sure that when we split an smrd instruction in order to move it to
diff --git a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index a6366cc..e6fa533 100644
--- a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt,-enable-ds128 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=-promote-alloca,-load-store-opt,-enable-ds128 < %s | FileCheck -check-prefix=GCN %s
@sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] poison
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 0b49b9c..5d169c1 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=verde < %s | FileCheck %s -check-prefixes=SI
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -check-prefixes=VI
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index a6b8ea3..f614f58 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s --check-prefixes=TAHITI
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefixes=TONGA
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s --check-prefixes=EG
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s --check-prefixes=TAHITI
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s --check-prefixes=TONGA
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG
define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: srem_i16_7:
@@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1
; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2
; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
+; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0
; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2
@@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8
; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10
; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10
+; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10
; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 33c2ce6..e64e3de 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_srem:
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 239de43..c05f341 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s -check-prefixes=SI
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index ed8b442..053038d 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index dcf0d3d1..477297b 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=MUBUF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=MUBUF11 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR11 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefix=MUBUF %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch | FileCheck -check-prefix=FLATSCR %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=MUBUF11 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+enable-flat-scratch | FileCheck -check-prefix=FLATSCR11 %s
; During instruction selection, we use immediate const zero for soffset in
; MUBUF stack accesses and let eliminateFrameIndex to fix up this field to use
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 4ddde7f..9cb22da 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
; Check that we properly realign the stack. While 4-byte access is all
; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g.
diff --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
index af48d7e..163821f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
; This test is for a bug in the machine scheduler where stores without
; an underlying object would be moved across the barrier. In this
diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index 1ff9b11..8abd29b 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=verde < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}store_i1:
; EG: MEM_RAT MSKOR
diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
index 470873f..994f353 100644
--- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s
-; RxN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s
+; RxN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-promote-alloca < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index 2efa022..a4e23ae 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
index 03a7ec4..3034711 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32:
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll
index 76e2d43..e4a0465 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,VI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=verde < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; RUN: llc -mtriple=r600-- -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll
index 1c4ac88..7a5c50b 100644
--- a/llvm/test/CodeGen/AMDGPU/store-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-private.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}store_i1:
; EG: MOVA_INT
diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index 86b11e4..922ef84 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}global_store_v3i64:
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
index 85f76a0..eb5bb5f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs< %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs< %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s
; This tests for a bug that caused a crash in
; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 92918f19..7d98f7f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,HAWAII %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,FIJI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,HAWAII %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,FIJI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
; CIVI-LABEL: local_store_i56:
diff --git a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
index e56226f..fe0fedb 100644
--- a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs <%s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 <%s | FileCheck -check-prefixes=GCN %s
;
; This test checks that we have the correct fold for zext(cc1) - zext(cc2).
;
diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
index 93a7108..6ee6a04 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index ec065b4..5c113d8 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 9f539bd..cd1c532 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
index 8e65e64..08ca848 100644
--- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_kernel void @sub_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; GFX9-LABEL: sub_var_var_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
new file mode 100644
index 0000000..baaca4dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+
+define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) {
+; GFX12-LABEL: test_sub_u64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_vs(i64 %a, i64 inreg %b) {
+; GFX12-LABEL: test_sub_u64_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e64 v[0:1], v[0:1], s[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) {
+; GFX12-LABEL: test_sub_u64_sv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_ss(i64 inreg %a, i64 inreg %b) {
+; GCN-LABEL: test_sub_u64_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_inline_lit_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_inline_lit_v:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 5, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_inline_lit_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 5, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 5, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_v_inline_lit(i64 %a) {
+; GFX12-LABEL: test_sub_u64_v_inline_lit:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_v_inline_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], -5, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, 5
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_small_imm_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_small_imm_v:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 0x1f4, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_small_imm_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 500, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) {
+; GFX12-LABEL: test_sub_u64_64bit_imm_v:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 1, v1, vcc_lo
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_64bit_imm_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
+ %sub = sub i64 5294967295, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <2 x float> @test_sub_u64_small_imm_s(i64 inreg %a) {
+; GCN-LABEL: test_sub_u64_small_imm_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_sub_nc_u64 s[0:1], 0x1f4, s[0:1]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+ %sub = sub i64 500, %a
+ %ret = bitcast i64 %sub to <2 x float>
+ ret <2 x float> %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll
index d2d6fdc..c1e83e6 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck %s
+; RUN: llc -o - %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a | FileCheck %s
@global = external protected addrspace(4) externally_initialized global [4096 x i64], align 16
diff --git a/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll b/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
index 094ca2a..bc84614 100644
--- a/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
define void @test(i1 %c0) #1 {
; Clean up the unreachable blocks introduced with LowerSwitch pass.
; This test ensures that, in the pass flow, UnreachableBlockElim pass
diff --git a/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll b/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
index 5b00296..5ae4bc2d 100644
--- a/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck %s
; This testcase was discovered in si-annotate-cf.ll, where none of the
; RUN lines was actually exercising it. See that files git log for its
diff --git a/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll b/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll
index c5763c6..c8c53e9 100644
--- a/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll
@@ -1,7 +1,7 @@
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-SDAG
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-GISEL
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-SDAG
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-SDAG
+; RUN: llc -global-isel=1 -new-reg-bank-select -march=amdgcn -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-SDAG
+; RUN: llc -global-isel=1 -new-reg-bank-select -march=amdgcn -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-GISEL
; GCN-LABEL: name: buffer_swizzle_bit_pregfx12
; PREGFX12-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
index 88c1fd9..dcaa46a 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-SELDAG -enable-var-scope %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-GISEL -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GCN-SELDAG -enable-var-scope %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL -enable-var-scope %s
; Callee with VGPR arguments
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
index 80dae91..2b1f638 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
declare hidden void @void_func_i32_inreg(i32 inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
index da32ac0..4068ea7 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; The tail call target is known uniform, but will be in a VGPR, so we
; need readfirstlane to legalize it.
diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
index 3119c32d..74eddf0 100644
--- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -disable-promote-alloca-to-vector -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -disable-promote-alloca-to-vector < %s | FileCheck %s
declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1
diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
index 28d40cd..89ddcac 100644
--- a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFILD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -combiner-tokenfactor-inline-limit=7 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFIL7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFILD %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -combiner-tokenfactor-inline-limit=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFIL7 %s
; GCN-LABEL: {{^}}token_factor_inline_limit_test:
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index 11ba2fd..0cf26be 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -stop-after=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
; If the block containing the SI_RETURN_TO_EPILOG is not the last block, insert an empty block at the end and
; insert an unconditional jump there.
define amdgpu_ps float @simple_test_return_to_epilog(float %a) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 69cc63e..469ea24 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s -o - -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s
-; RUN: llc %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s
+; RUN: llc %s -o - -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s
+; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA-TRAP-GFX803 %s
+; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
+; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
+; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s
+; RUN: llc %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s
declare void @llvm.trap() #0
declare void @llvm.debugtrap() #1
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 9bab3e6..9c7f393 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -1,27 +1,27 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; enable trap handler feature
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
; disable trap handler feature
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s

-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index def8d7e..f5c8cdb 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
define amdgpu_kernel void @trunc_i64_bitcast_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: trunc_i64_bitcast_v2i32:
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index da5ec09..2d1c85e 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index dd3499e..cf84465 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
; Make sure high constant 0 isn't pointlessly materialized
define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
index 803d7bf..c6b5ae4 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
index b42af2f..ecc1def 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
; GCN-LABEL: {{^}}global_truncstore_i32_to_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
index 7dae26f..083e600 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}trunc_store_v4i64_v4i8:
; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
index b2b9055..5f88e60 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}short_char:
; GCN: global_store_byte v
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index 8d17a01..76f60f1 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index ef2eca8..8629d54 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after twoaddressinstruction < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after twoaddressinstruction < %s | FileCheck %s
; Check that %16 gets constrained to register class sgpr_96_with_sub0_sub1.
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 6606b1d..d230ff5 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 04b9873..063c56f 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s -check-prefixes=SI
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index e901793..bc9a3f2 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_udiv_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 74e536f..eaab353 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr addrspace(1) %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
; R600-LABEL: test_udivrem:
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
index dc58843..5477d62 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv24_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 97738a79..ab278c3 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 5b1a520..d25178f 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
index b3d5894..4603efb 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_uint_to_fp_i32_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index eb1b844..9bcba6c 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @uitofp_i16_to_f16(
; SI-LABEL: uitofp_i16_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 4726e81..9d8a45a 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index fc33a27..15065eb 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefixes=SI,MUBUF,ALIGNED %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+unaligned-access-mode -verify-machineinstrs< %s | FileCheck -check-prefixes=SI,MUBUF,UNALIGNED %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefixes=SI,MUBUF,ALIGNED %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FLATSCR,ALIGNED %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI,MUBUF,ALIGNED %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=SI,MUBUF,UNALIGNED %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=SI,MUBUF,ALIGNED %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=SI,FLATSCR,ALIGNED %s
; SI-LABEL: {{^}}local_unaligned_load_store_i16:
; SI: ds_read_u8
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 78103d5..31708a9 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -early-live-intervals < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -early-live-intervals < %s | FileCheck %s
; We may have subregister live ranges that are undefined on some paths. The
; verifier should not complain about this.
diff --git a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index c88499d..1813acf 100644
--- a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -1,6 +1,6 @@
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; XUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+; RUN: llc -O0 -asm-verbose=0 -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -asm-verbose=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; XUN: llc -O0 -asm-verbose=0 -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
index 7417f86..c8d3148 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; This used to raise an assertion due to how the choice between uniform and
; non-uniform branches was determined.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 374c670..5108159 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -structurizecfg-skip-uniform-regions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -structurizecfg-skip-uniform-regions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -structurizecfg-skip-uniform-regions < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -structurizecfg-skip-uniform-regions < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) {
; SI-LABEL: uniform_if_scc:
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
index e8790f0..1aea988 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}icmp_2_users:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index c3dcc78..90891cb 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,W32 --enable-var-scope %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,W64 --enable-var-scope %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,W32 --enable-var-scope %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,W64 --enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -S -amdgpu-annotate-uniform < %s | FileCheck --check-prefixes=OPT,OPT-W32 --enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -S -amdgpu-annotate-uniform < %s | FileCheck --check-prefixes=OPT,OPT-W64 --enable-var-scope %s
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
index ea127b7..ab26402 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -o - %s | FileCheck --check-prefix=GCN %s
;
; This test shows a typical case that a PHI(%c2) in join block was treated as uniform
; as it has one unique uniform incoming value plus one additional undef incoming
diff --git a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
index 3bc6e3d..fd7e9f0 100644
--- a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s
;
; This test used to crash with the following assertion:
; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed.
diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
index 33ac697..6b317de 100644
--- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
-; RUN: llc -mtriple=r600-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -mtriple=r600-- -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
; Should not crash when the processor is not recognized and the
; wavefront size feature not set.
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-half.ll b/llvm/test/CodeGen/AMDGPU/unpack-half.ll
index b4519d5..d9f28be 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-half.ll
+++ b/llvm/test/CodeGen/AMDGPU/unpack-half.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
; On gfx6 and gfx7, this test shows a bug in SelectionDAG where scalarizing the
; extension of a vector of f16 generates an illegal node that errors later.
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index e0d1698..25e8581 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amdhsa -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
-; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI-OPT %s
+; RUN: llc -mtriple=amdgcn-amdhsa -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI-OPT %s
define hidden void @widget() {
; GCN-LABEL: widget:
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
index b762226..721114e 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -mtriple=amdgcn-mesa-mesa3d -tailcallopt -verify-machineinstrs=0 < %s 2>&1 | FileCheck --check-prefix=GCN %s
-; RUN: not llc -mtriple=amdgcn--amdpal -tailcallopt -verify-machineinstrs=0 < %s 2>&1 | FileCheck --check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-mesa-mesa3d -tailcallopt < %s 2>&1 | FileCheck --check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn--amdpal -tailcallopt < %s 2>&1 | FileCheck --check-prefix=GCN %s
; RUN: not llc -mtriple=r600-- -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
declare i32 @external_function(i32) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/unsupported-cs-chain.ll
index 1cbf904..ae27152 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-cs-chain.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs=0 < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs=0 < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -global-isel=1 -mattr=+wavefrontsize64 < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -global-isel=0 -mattr=+wavefrontsize64 < %s 2>&1 | FileCheck %s
declare amdgpu_cs_chain void @callee() nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll
index 40f1664..c009283 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-a16.ll
@@ -1,5 +1,5 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
-; RUN: not llc -global-isel=1 -global-isel-abort=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
+; RUN: not llc -global-isel=1 -global-isel-abort=1 -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
; Make sure this doesn't assert on targets without the r128-16
; feature, and instead generates a selection error.
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
index df91887..cd96298 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
@@ -1,8 +1,8 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s

-; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
-; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
; Make sure this doesn't assert on targets without the g16 feature, and instead
; generates a selection error.
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll
index b3cf379..b61abc8 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll
@@ -1,10 +1,10 @@
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: not llc -O0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX90A %s
-; RUN: not llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX942 %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: not llc -O0 -mtriple=amdgcn -mcpu=gfx90a < %s 2>&1 | FileCheck -check-prefixes=GFX90A %s
+; RUN: not llc -O0 -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=GFX942 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s
; GFX9-LABEL: image_sample_test:
; GFX9: image_sample_lz
diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll
index 4b8127f..2893952 100644
--- a/llvm/test/CodeGen/AMDGPU/urem.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by urem is long and complex and may frequently
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 6480a88..464dad8 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_urem_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index a53532f..f50576e 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
declare float @llvm.fma.f32(float, float, float) #1
declare double @llvm.fma.f64(double, double, double) #1
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 2f4f081..7d7f1b4 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll
index a66c4ef..ada0dab 100644
--- a/llvm/test/CodeGen/AMDGPU/v1024.ll
+++ b/llvm/test/CodeGen/AMDGPU/v1024.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
; Check that we do not use AGPRs for v32i32 type
diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index 2e52e51..f95bc0b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
define amdgpu_kernel void @sdwa_test() local_unnamed_addr #0 {
; GFX9-LABEL: sdwa_test:
; GFX9: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
index 684ab80..aea2a8b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s
define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
; GFX950-LABEL: v_ashr_pk_i8_i32:
; GFX950: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index bff5c6c..a6a4069 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK %s
define amdgpu_kernel void @icmp_test() {
; CHECK-LABEL: icmp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index a41063f..b314cf2 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
diff --git a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
index 5a4d079..8179c0f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.cvt.pk.u8.f32(float, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll
index 7fe33d5..c128715 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH,GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=SI,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck --check-prefixes=VI-FLUSH,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index d7a837a..bcc60b0 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index 580938f..3afe55f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
define amdgpu_kernel void @madak_f16(
; SI-LABEL: madak_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 8a88298..d8044139 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 6ab3022..6b5bae0 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=SDAG-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=SDAG-GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s

-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16
diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index 83f0778..92bc01e 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
define amdgpu_kernel void @sdwa_test_sub() local_unnamed_addr #0 {
; GFX9-LABEL: sdwa_test_sub:
; GFX9: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
index 79ec4b8..490046c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
define half @swap(half %a, half %b, i32 %i) {
; GFX11-TRUE16-LABEL: swap:
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index c500565..0f368ff 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
index 33ca718..3c32cba 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
; GFX1010-LABEL: test_insert_vcmpx_pattern_lt:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index de94ee9..9c05f4d 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,6 +1,6 @@
; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
; OPT-LABEL: @vector_read_alloca_bitcast(
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
index a3e0dbe..1a08bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
index bee2b70..b445c1e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; Test that when extracting the same unknown vector index from an
; insertelement, the dynamic indexing is folded away.
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 58602a1..2f25a93 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GX900 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_23uu:
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
index a0e87d7..55904eb 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
index 5abaf06..68cc080 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}load_idx_idy:
; GCN-NOT: global_load
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
index 2ee62d1..e0dfdba 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX900 %s
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90a %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -O0 < %s | FileCheck -check-prefix=GFX900 %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx906 -O0 < %s | FileCheck -check-prefix=GFX906 %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -O0 < %s | FileCheck -check-prefix=GFX90a %s
; This test used to crash for gfx908 while allocating the tuple. Compared to the other subtargets,
; gfx908 reserves an extra VGPR for AGPR-to-VGPR copies, which adds register pressure.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index aea25b3..371ae03 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-opt-vgpr-liverange=true -stop-after=si-opt-vgpr-liverange -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-opt-vgpr-liverange=true -stop-after=si-opt-vgpr-liverange < %s | FileCheck -check-prefix=SI %s
; a normal if-else
define amdgpu_ps float @else1(i32 %z, float %v) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index c0b56d0..b46f5f5 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-opt-vgpr-liverange=true -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-opt-vgpr-liverange=true < %s | FileCheck -check-prefix=SI %s
; a normal if-else
define amdgpu_ps float @else1(i32 %z, float %v) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index a69ada2..bca7a21 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,9 +1,9 @@
; XFAIL: *
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
-; RUN: llc -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
-; RUN: llc -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
+; RUN: llc -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
; This ends up using all 256 registers and requires register
; scavenging, which will fail to find an unused register.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 8dfd841..db49339 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; This ends up using all 255 registers and requires register
; scavenging, which will fail to find an unused register.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index ebf6bd1..14f222a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
declare void @extern_func() #2
diff --git a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
index bfa106e..83c0ef7 100644
--- a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Test that we correctly commute a sub instruction
; FUNC-LABEL: {{^}}sub_rev:
diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll
index a6dcbb5..6a9fbcf 100644
--- a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_cs void @_amdgpu_cs_main(i32 %0) {
; GFX11-LABEL: _amdgpu_cs_main:
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 4ce71e1..6291600 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefixes=SI %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=VI %s
+;RUN: llc < %s -mtriple=amdgcn | FileCheck --check-prefixes=SI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefixes=VI %s
;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefixes=EG %s
define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
index 8f2ade7..77dc32d 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: intrinsic_store_system_scope:
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index f4b9523..af8b9e7 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -966,3 +966,45 @@ body: |
$vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
$sgpr0 = S_MOV_B32 0
...
+
+# TODO: Unnecessary wait before overwriting vgpr0.
+---
+name: overwrite_vgpr_after_smem
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+ ; GCN-LABEL: name: overwrite_vgpr_after_smem
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+...
+
+# TODO: Unnecessary wait before overwriting sgpr0.
+---
+name: overwrite_sgpr_after_vmem
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+ ; GCN-LABEL: name: overwrite_sgpr_after_vmem
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll
index 8d88a115..10090e3 100644
--- a/llvm/test/CodeGen/AMDGPU/wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
-; RUN: llc -mtriple=amdgcn --misched=ilpmax -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
-; RUN: llc -mtriple=amdgcn --misched=ilpmax -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn --misched=ilpmax < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
+; RUN: llc -mtriple=amdgcn --misched=ilpmax -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
; The ilpmax scheduler is used for the second test to get the ordering the test expects.
; DEFAULT-LABEL: {{^}}main:
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index a376262..f3cb5a7 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-back-off-barrier -asm-verbose=0 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
; GFX8-LABEL: barrier_vmcnt_global:
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
index 6133cb4..ddb6afa 100644
--- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4212fd3..097154e 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index a798dc1..76c331c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
; This compute shader has input args that claim it has 17 sgprs and 5 vgprs
; in wave dispatch. Ensure that the sgpr and vgpr counts in COMPUTE_PGM_RSRC1
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 9bb8a2f..19c8e84 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GCN
define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 {
; GCN-LABEL: while_break:
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
new file mode 100644
index 0000000..93f4891
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
@@ -0,0 +1,448 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s
+
+---
+name: save_inactive_lanes_non_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name: save_all_lanes_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: save_all_lanes_csr_vgpr
+ ; CHECK: liveins: $vgpr40
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 14, implicit $exec
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0
+
+...
+---
+name: save_csr_sgpr_to_non_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr191
+ ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr
+ ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192
+ ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+ $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $sgpr20 = S_MOV_B32 14, implicit $exec
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
+---
+name: save_csr_sgpr_to_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr191
+ ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr
+ ; CHECK: liveins: $sgpr20, $vgpr191
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+ $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $sgpr20 = S_MOV_B32 14, implicit $exec
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
+---
+name: vgpr_and_sgpr_csr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+frameInfo:
+ maxAlignment: 4
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ hasSpilledSGPRs: true
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ spillPhysVGPRs:
+ - '$vgpr191'
+ wwmReservedRegs:
+ - '$vgpr191'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191
+
+ ; CHECK-LABEL: name: vgpr_and_sgpr_csr
+ ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+ $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
+---
+name: split_orig_exec
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+frameInfo:
+ maxAlignment: 4
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ hasSpilledSGPRs: true
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ spillPhysVGPRs:
+ - '$vgpr191'
+ wwmReservedRegs:
+ - '$vgpr191'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191
+
+ ; CHECK-LABEL: name: split_orig_exec
+ ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ ; CHECK-NEXT: $sgpr3 = COPY $vcc_lo
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3
+ $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ $sgpr3 = COPY $vcc_lo
+ S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3
+
+...
+---
+name: vgpr_superregs
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: vgpr_superregs
+ ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+ ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+ ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+ ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name: dont_restore_used_vgprs
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr20' }
+ - { reg: '$vgpr40' }
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr20, $vgpr40
+
+ ; CHECK-LABEL: name: dont_restore_used_vgprs
+ ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name: multiple_blocks
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ ; CHECK-LABEL: name: multiple_blocks
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+ ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $vgpr0, $vgpr1
+
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $sgpr1 = S_MOV_B32 $exec_lo
+ V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+ renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+
+ bb.2:
+ liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+ $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
new file mode 100644
index 0000000..a13a68a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -0,0 +1,2414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=DAGISEL64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GISEL64 %s
+
+; Make sure the i1 %active is passed through EXEC.
+; The EXEC mask should be set to -1 for the duration of the function
+; and restored to its original value in the epilogue.
+; We will also need to restore the inactive lanes for any allocated VGPRs.
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: basic_test:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: basic_test:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if there's only one use for %active.
+define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: single_use_of_active:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: single_use_of_active:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: single_use_of_active:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: single_use_of_active:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %y = select i1 %active, i32 %b, i32 17
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: unused_active:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 14
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: unused_active:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: v_mov_b32_e32 v0, 14
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: unused_active:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: unused_active:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: v_mov_b32_e32 v0, 14
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ ret i32 14
+}
+
+; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes.
+; For CSR VGPRs, we need to restore all lanes.
+define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber non-CSR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xf1ff
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_store_b32 off, v2, s32
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: v_writelane_b32 v2, s20, 0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber non-CSR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT: v_readlane_b32 s20, v2, 0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_load_b32 v2, off, s32
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xf1ff
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber non-CSR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xf1ff
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: v_writelane_b32 v2, s20, 0
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber non-CSR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT: v_readlane_b32 s20, v2, 0
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xf1ff
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"()
+ call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"()
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Save and restore all lanes of v40.
+define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr_vgpr_only:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR VGPR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr_vgpr_only:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR VGPR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr_vgpr_only:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR VGPR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr_vgpr_only:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR VGPR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; clobber CSR VGPR", "~{v40}"()
+ ret void
+}
+
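+; A clobbered callee-saved SGPR is spilled to a VGPR lane, so the inactive
+; lanes of that VGPR need to be saved and restored.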
+define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: sgpr_spill_only:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR SGPR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sgpr_spill_only:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: v_writelane_b32 v0, s68, 0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR SGPR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s68, v0, 0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: sgpr_spill_only:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR SGPR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: sgpr_spill_only:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: v_writelane_b32 v0, s68, 0
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR SGPR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s68, v0, 0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; clobber CSR SGPR", "~{s68}"()
+ ret void
+}
+
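+; Make sure whole wave functions work with control flow across multiple blocks.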
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: multiple_blocks:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL-NEXT: ; %bb.1: ; %if.then
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL-NEXT: ; %bb.2: ; %if.end
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: multiple_blocks:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; GISEL-NEXT: ; %bb.1: ; %if.then
+; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; GISEL-NEXT: ; %bb.2: ; %if.end
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: multiple_blocks:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec
+; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL64-NEXT: ; %bb.1: ; %if.then
+; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL64-NEXT: ; %bb.2: ; %if.end
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: multiple_blocks:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL64-NEXT: s_mov_b64 s[2:3], exec
+; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; GISEL64-NEXT: ; %bb.1: ; %if.then
+; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; GISEL64-NEXT: ; %bb.2: ; %if.end
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f
+ ret i32 %e
+}
+
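+; Test 64-bit arguments and return value, which are split across pairs of VGPRs.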
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+; DAGISEL-LABEL: ret_64:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0
+; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1
+; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: ret_64:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: ret_64:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
+
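+; Test that inreg arguments are taken from SGPRs.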
+define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) {
+; DAGISEL-LABEL: inreg_args:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: s_clause 0x5
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9
+; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s10
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s11
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: s_clause 0x5
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: inreg_args:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s34, -1
+; GISEL-NEXT: s_clause 0x5
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_mov_b32 s0, s5
+; GISEL-NEXT: s_mov_b32 s1, s6
+; GISEL-NEXT: s_mov_b32 s2, s7
+; GISEL-NEXT: s_mov_b32 s3, s8
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v5, s9
+; GISEL-NEXT: scratch_store_b32 off, v4, s10
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11
+; GISEL-NEXT: scratch_store_b32 off, v5, s11
+; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1
+; GISEL-NEXT: s_clause 0x5
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GISEL-NEXT: s_mov_b32 exec_lo, s34
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: inreg_args:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: s_clause 0x5
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4
+; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5
+; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7
+; DAGISEL64-NEXT: v_mov_b32_e32 v3, s8
+; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: s_clause 0x5
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: inreg_args:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1
+; GISEL64-NEXT: s_clause 0x5
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_mov_b32 s0, s5
+; GISEL64-NEXT: s_mov_b32 s1, s6
+; GISEL64-NEXT: s_mov_b32 s2, s7
+; GISEL64-NEXT: s_mov_b32 s3, s8
+; GISEL64-NEXT: v_mov_b32_e32 v4, s4
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_mov_b32_e32 v0, s0
+; GISEL64-NEXT: v_mov_b32_e32 v1, s1
+; GISEL64-NEXT: v_mov_b32_e32 v2, s2
+; GISEL64-NEXT: v_mov_b32_e32 v3, s3
+; GISEL64-NEXT: v_mov_b32_e32 v5, s9
+; GISEL64-NEXT: scratch_store_b32 off, v4, s10
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11
+; GISEL64-NEXT: scratch_store_b32 off, v5, s11
+; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1
+; GISEL64-NEXT: s_clause 0x5
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GISEL64-NEXT: s_mov_b64 exec, s[34:35]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ store i32 %i32, ptr addrspace(5) %ptr
+ store <4 x i32> %v4i32, ptr addrspace(5) %ptr2
+ store float %float, ptr addrspace(5) %ptr2
+ ret void
+}
+
+declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y)
+
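+; Test calling a regular amdgpu_gfx function from a whole wave function.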
+define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) {
+; DAGISEL-LABEL: call_gfx_from_whole_wave:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3
+; DAGISEL-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL-NEXT: v_swap_b32 v0, v1
+; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0
+; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
+; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: call_gfx_from_whole_wave:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; GISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_writelane_b32 v40, s0, 3
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GISEL-NEXT: v_swap_b32 v0, v1
+; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL-NEXT: v_writelane_b32 v40, s30, 1
+; GISEL-NEXT: v_writelane_b32 v40, s31, 2
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s31, v40, 2
+; GISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GISEL-NEXT: v_readlane_b32 s4, v40, 0
+; GISEL-NEXT: v_readlane_b32 s0, v40, 3
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: call_gfx_from_whole_wave:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_mov_b32 s0, s33
+; DAGISEL64-NEXT: s_mov_b32 s33, s32
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL64-NEXT: v_swap_b32 v0, v1
+; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0
+; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1
+; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2
+; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
+; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1
+; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0
+; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b32 s32, s33
+; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; DAGISEL64-NEXT: s_mov_b32 s33, s0
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: call_gfx_from_whole_wave:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_mov_b32 s0, s33
+; GISEL64-NEXT: s_mov_b32 s33, s32
+; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; GISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; GISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_writelane_b32 v40, s0, 4
+; GISEL64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL64-NEXT: v_swap_b32 v0, v1
+; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL64-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT: v_writelane_b32 v40, s5, 1
+; GISEL64-NEXT: v_writelane_b32 v40, s30, 2
+; GISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
+; GISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT: v_readlane_b32 s5, v40, 1
+; GISEL64-NEXT: v_readlane_b32 s4, v40, 0
+; GISEL64-NEXT: v_readlane_b32 s0, v40, 4
+; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b32 s32, s33
+; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; GISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL64-NEXT: s_mov_b32 s33, s0
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
+ ret <2 x half> %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index cb3a0e1..06c4518 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; The test forces high vector register pressure, so there won't be sufficient VGPRs to be allocated
; for writelane/readlane SGPR spill instructions. Regalloc would split the vector register liverange
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 1f6e3a9..9e9fe180 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --o - %s | FileCheck -check-prefix=GCN-O0 %s
; Test whole-wave register spilling.
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 77d1e6c..04a5cac 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
-; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
-; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefix=SI %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefix=VI %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i16_constant_load:
diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
index ce01a9d..1a8f198 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Check that DAGTypeLegalizer::WidenVSELECTAndMask doesn't try to
; create vselects with i64 condition masks.
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
new file mode 100644
index 0000000..2f7a6e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
@@ -0,0 +1,902 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s
+
+# WMMA writes: D0, WMMA reads: A0/B0/Index0
+# VALU writes: D1, VALU reads: Use1
+# Hazards could be:
+# RAW: D0 overlaps Use1
+# WAW: D0 overlaps D1
+# WAR: A0/B0/Index0 overlaps D1
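+#
+# A note on mitigation, reflecting the CHECK lines below: the post-RA
+# hazard recognizer resolves these overlaps by inserting V_NOPs after the
+# WMMA. Independent VALU instructions already sitting between the WMMA and
+# the conflicting VALU count toward the required separation (no V_NOPs are
+# added), whereas SALU instructions do not (the V_NOPs are still inserted).
+# The required separation varies with the WMMA variant: 2, 4, or 8 V_NOPs
+# across the cases in this file.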
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec
+ ; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec
+ ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
+ ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
+ ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr26 = V_MOV_B32_e32 26, implicit $exec
+ $vgpr27 = V_MOV_B32_e32 27, implicit $exec
+ $vgpr28 = V_MOV_B32_e32 28, implicit $exec
+ $vgpr29 = V_MOV_B32_e32 29, implicit $exec
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
+ ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $sgpr2 = S_MOV_B32 2
+ $sgpr3 = S_MOV_B32 3
+ $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec
+ ; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec
+ ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec
+ ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec
+ ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr26 = V_MOV_B32_e32 26, implicit $exec
+ $vgpr27 = V_MOV_B32_e32 27, implicit $exec
+ $vgpr28 = V_MOV_B32_e32 28, implicit $exec
+ $vgpr29 = V_MOV_B32_e32 29, implicit $exec
+ $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
+ ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $sgpr2 = S_MOV_B32 2
+ $sgpr3 = S_MOV_B32 3
+ $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
+ ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $sgpr2 = S_MOV_B32 2
+ $sgpr3 = S_MOV_B32 3
+ $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ ; GFX1250-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec
+ ; GFX1250-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec
+ ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
+ ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ $vgpr44 = V_MOV_B32_e32 44, implicit $exec
+ $vgpr45 = V_MOV_B32_e32 45, implicit $exec
+ $vgpr46 = V_MOV_B32_e32 46, implicit $exec
+ $vgpr47 = V_MOV_B32_e32 47, implicit $exec
+ $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
+ ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3
+ ; GFX1250-NEXT: $sgpr4 = S_MOV_B32 4
+ ; GFX1250-NEXT: $sgpr5 = S_MOV_B32 5
+ ; GFX1250-NEXT: $sgpr6 = S_MOV_B32 6
+ ; GFX1250-NEXT: $sgpr7 = S_MOV_B32 7
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $sgpr2 = S_MOV_B32 2
+ $sgpr3 = S_MOV_B32 3
+ $sgpr4 = S_MOV_B32 4
+ $sgpr5 = S_MOV_B32 5
+ $sgpr6 = S_MOV_B32 6
+ $sgpr7 = S_MOV_B32 7
+ $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
+ $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ ; GFX1250-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec
+ ; GFX1250-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec
+ ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
+ ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
+ ; GFX1250-NEXT: $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ $vgpr44 = V_MOV_B32_e32 44, implicit $exec
+ $vgpr45 = V_MOV_B32_e32 45, implicit $exec
+ $vgpr46 = V_MOV_B32_e32 46, implicit $exec
+ $vgpr47 = V_MOV_B32_e32 47, implicit $exec
+ $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
+ ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3
+ ; GFX1250-NEXT: $sgpr4 = S_MOV_B32 4
+ ; GFX1250-NEXT: $sgpr5 = S_MOV_B32 5
+ ; GFX1250-NEXT: $sgpr6 = S_MOV_B32 6
+ ; GFX1250-NEXT: $sgpr7 = S_MOV_B32 7
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $sgpr2 = S_MOV_B32 2
+ $sgpr3 = S_MOV_B32 3
+ $sgpr4 = S_MOV_B32 4
+ $sgpr5 = S_MOV_B32 5
+ $sgpr6 = S_MOV_B32 6
+ $sgpr7 = S_MOV_B32 7
+ $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec
+ $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 40, implicit $exec
+ $vgpr41 = V_MOV_B32_e32 41, implicit $exec
+ $vgpr42 = V_MOV_B32_e32 42, implicit $exec
+ $vgpr43 = V_MOV_B32_e32 43, implicit $exec
+ $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
+ ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
+ ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 1
+ $sgpr2 = S_MOV_B32 2
+ $sgpr3 = S_MOV_B32 3
+ $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index 2833237..4a01007 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
index c208290..1b44e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
index d99ed8a..9453058 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
index d10dfca..cd7edc2 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
index 6174841..d676252 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 436825e..53bede8 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index 5b01b17..a8f5726 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 616fa39..9303dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 311e76b..fdfec74 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 901405c..896efb0 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
new file mode 100644
index 0000000..2032b98
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
@@ -0,0 +1,1430 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s
+
+# For two consecutive wmma instructions, the post-RA hazard recognizer must insert
+# one or more V_NOP instructions between them if matrix A, B, or the index of the
+# second wmma is the same as, or overlaps with, the previous wmma instruction's
+# D-matrix.
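+#
+# A minimal sketch of the hazard pattern exercised below (the register ranges are
+# illustrative only, not taken from any particular test):
+#
+#   $vgpr4_..._vgpr11 = V_WMMA_* ...        # first wmma writes D0 = v[4:11]
+#   V_NOP_e32 implicit $exec                # nop(s) inserted by post-RA-hazard-rec
+#   ... = V_WMMA_* 8, $vgpr4_vgpr5, ...     # second wmma reads A1 = v[4:5], which
+#                                           # overlaps D0, so nops are required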
+
+---
+name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
+ $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+ $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+ $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec
+ $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index 4c1eefd..cc3d57c 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck %s
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index b7b6028..0503fa6 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index 524a25c..138d80d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index 1ab82b0..fc323c6 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
;
diff --git a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll
index 82d276e..bd74234 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK %s
; Test that s_wqm is executed before lds.param.load.
define amdgpu_ps <3 x float> @test_param_load(i32 inreg %attr, <3 x float> %to_add) {
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 1ca2a8a..ad8dcd3 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
; Check that WQM isn't triggered by image load/store intrinsics.
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
diff --git a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
index de3b1d5..a1850bc 100644
--- a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -1,6 +1,6 @@
; XFAIL: *
; REQUIRES: asserts
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s
; write_register doesn't prevent us from illegally trying to write a
; vgpr value into a scalar register, but I don't think there's much we
diff --git a/llvm/test/CodeGen/AMDGPU/write_register.ll b/llvm/test/CodeGen/AMDGPU/write_register.ll
index f6ac26e..eaf1088 100644
--- a/llvm/test/CodeGen/AMDGPU/write_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/write_register.ll
@@ -1,5 +1,5 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 < %s | FileCheck %s
declare void @llvm.write_register.i32(metadata, i32) #0
declare void @llvm.write_register.i64(metadata, i64) #0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index af7d169..f63329b 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O0 %s
-; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O3 %s
+; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX9-O0 %s
+; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX9-O3 %s
define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-LABEL: strict_wwm_no_cfg:
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 09d19be..7dd03ad 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O0 %s
-; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O3 %s
+; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX9-O0 %s
+; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false < %s | FileCheck -check-prefix=GFX9-O3 %s
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
index 0099a37..b8acdd9 100644
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GCN-DL %s
; GCN-LABEL: {{^}}scalar_xnor_i32_one_use
; GCN: s_xnor_b32
diff --git a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
index a9f1dc4..3059b5b 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN %s
; This test used to crash
define amdgpu_ps float @xor3_i1_const(float inreg %arg1, i32 inreg %arg2) {
diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll
index 6c5a467..67ef489 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_XOR3_B32
diff --git a/llvm/test/CodeGen/AMDGPU/xor_add.ll b/llvm/test/CodeGen/AMDGPU/xor_add.ll
index b88ea55..78a7faa 100644
--- a/llvm/test/CodeGen/AMDGPU/xor_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor_add.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 | FileCheck -check-prefix=GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_XAD_U32
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index c77828a..f0f8eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
; R600: {{^}}s_mad_zext_i32_to_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
index 45cb7955..c393582 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) {
; GCN-LABEL: zext_i16_to_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index 14c5642..01a135e 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}zext_or_operand_i64:
; GCN: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
diff --git a/llvm/test/CodeGen/ARM/bad-constraint.ll b/llvm/test/CodeGen/ARM/bad-constraint.ll
new file mode 100644
index 0000000..9b8fcd5
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/bad-constraint.ll
@@ -0,0 +1,25 @@
+; RUN: not llc -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s
+; CHECK: error: couldn't allocate input reg for constraint '{d2}'
+; CHECK-NEXT: error: couldn't allocate input reg for constraint '{s2}'
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8a-unknown-linux-gnueabihf"
+
+@a = local_unnamed_addr global i32 0, align 4
+
+define void @_Z1bv() local_unnamed_addr {
+entry:
+ %0 = load i32, ptr @a, align 4
+ %conv = sext i32 %0 to i64
+ tail call void asm sideeffect "", "{d2}"(i64 %conv)
+ ret void
+}
+
+define void @_Z1cv() local_unnamed_addr {
+entry:
+ %0 = load i32, ptr @a, align 4
+ %conv = sext i32 %0 to i64
+ tail call void asm sideeffect "", "{s2}"(i64 %conv)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/ARM/inlineasm-int-to-float.ll b/llvm/test/CodeGen/ARM/inlineasm-int-to-float.ll
new file mode 100644
index 0000000..1c301b6
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/inlineasm-int-to-float.ll
@@ -0,0 +1,17 @@
+; RUN: llc -filetype=asm %s -o - | FileCheck %s
+
+; CHECK: movw r0, :lower16:a
+; CHECK-NEXT: movt r0, :upper16:a
+; CHECK-NEXT: vldr s6, [r0]
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8a-unknown-linux-gnueabihf"
+
+@a = local_unnamed_addr global i32 0, align 4
+
+define void @_Z1dv() local_unnamed_addr {
+entry:
+ %0 = load i32, ptr @a, align 4
+ tail call void asm sideeffect "", "{s6}"(i32 %0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/stack-protector-eh-sjlj.ll b/llvm/test/CodeGen/ARM/stack-protector-eh-sjlj.ll
new file mode 100644
index 0000000..fbd01ca
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/stack-protector-eh-sjlj.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=thumbv7s-apple-darwin < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-Fi8-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+
+; Function Attrs: mustprogress noinline optnone ssp
+define ptr @foo() #0 personality ptr @__gxx_personality_sj0 {
+; CHECK-LABEL: foo:
+; CHECK: Lfunc_begin0:
+; CHECK-NEXT: @ %bb.0:
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: add r7, sp, #12
+; CHECK-NEXT: push.w {r8, r10, r11}
+; CHECK-NEXT: sub.w r4, sp, #64
+; CHECK-NEXT: bfc r4, #0, #4
+; CHECK-NEXT: mov sp, r4
+; CHECK-NEXT: vst1.64 {d8, d9, d10, d11}, [r4:128]!
+; CHECK-NEXT: vst1.64 {d12, d13, d14, d15}, [r4:128]
+; CHECK-NEXT: sub sp, #96
+; CHECK-NEXT: movw r0, :lower16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_2+4))
+; CHECK-NEXT: movt r0, :upper16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_2+4))
+; CHECK-NEXT: LPC0_2:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: movw r0, :lower16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_3+4))
+; CHECK-NEXT: movt r0, :upper16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_3+4))
+; CHECK-NEXT: LPC0_3:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: str r0, [sp, #92]
+; CHECK-NEXT: movw r0, :lower16:(L___gxx_personality_sj0$non_lazy_ptr-(LPC0_4+4))
+; CHECK-NEXT: movt r0, :upper16:(L___gxx_personality_sj0$non_lazy_ptr-(LPC0_4+4))
+; CHECK-NEXT: LPC0_4:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: str r0, [sp, #36]
+; CHECK-NEXT: ldr r0, LCPI0_0
+; CHECK-NEXT: LPC0_0:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: str r7, [sp, #44]
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: str r0, [sp, #52]
+; CHECK-NEXT: ldr r0, LCPI0_1
+; CHECK-NEXT: orr r0, r0, #1
+; CHECK-NEXT: LPC0_1:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: str r0, [sp, #48]
+; CHECK-NEXT: add r0, sp, #12
+; CHECK-NEXT: bl __Unwind_SjLj_Register
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: str r0, [sp, #16]
+; CHECK-NEXT: movw r0, :lower16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_5+4))
+; CHECK-NEXT: movt r0, :upper16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_5+4))
+; CHECK-NEXT: LPC0_5:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r1, [sp, #92]
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: bne LBB0_7
+; CHECK-NEXT: @ %bb.1: @ %SP_return
+; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bl _foo2
+; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: b LBB0_2
+; CHECK-NEXT: LBB0_2:
+; CHECK-NEXT: movs r0, #2
+; CHECK-NEXT: str r0, [sp, #16]
+; CHECK-NEXT: movw r0, :lower16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_6+4))
+; CHECK-NEXT: movt r0, :upper16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_6+4))
+; CHECK-NEXT: LPC0_6:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r1, [sp, #92]
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: bne LBB0_7
+; CHECK-NEXT: @ %bb.3: @ %SP_return2
+; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: bl _foo3
+; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: b LBB0_6
+; CHECK-NEXT: LBB0_4:
+; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: ldr r0, [sp, #20]
+; CHECK-NEXT: ldr r0, [sp, #24]
+; CHECK-NEXT: add r0, sp, #12
+; CHECK-NEXT: bl __Unwind_SjLj_Unregister
+; CHECK-NEXT: movw r0, :lower16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_7+4))
+; CHECK-NEXT: movt r0, :upper16:(L___stack_chk_guard$non_lazy_ptr-(LPC0_7+4))
+; CHECK-NEXT: LPC0_7:
+; CHECK-NEXT: add r0, pc
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r0, [r0]
+; CHECK-NEXT: ldr r1, [sp, #92]
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: bne LBB0_7
+; CHECK-NEXT: @ %bb.5: @ %SP_return3
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: add r4, sp, #96
+; CHECK-NEXT: vld1.64 {d8, d9, d10, d11}, [r4:128]!
+; CHECK-NEXT: vld1.64 {d12, d13, d14, d15}, [r4:128]
+; CHECK-NEXT: sub.w r4, r7, #24
+; CHECK-NEXT: mov sp, r4
+; CHECK-NEXT: pop.w {r8, r10, r11}
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: LBB0_6:
+; CHECK-NEXT: trap
+; CHECK-NEXT: LBB0_7: @ %CallStackCheckFailBlk
+; CHECK-NEXT: bl ___stack_chk_fail
+; CHECK-NEXT: LBB0_8:
+; CHECK-NEXT: ldr r0, [sp, #16]
+; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: cmp r0, #2
+; CHECK-NEXT: bhi LBB0_12
+; CHECK-NEXT: @ %bb.9:
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: LCPI0_2:
+; CHECK-NEXT: tbb [pc, r1]
+; CHECK-NEXT: @ %bb.10:
+; CHECK-NEXT: LJTI0_0:
+; CHECK-NEXT: .data_region jt8
+; CHECK-NEXT: .byte (LBB0_11-(LCPI0_2+4))/2
+; CHECK-NEXT: .byte (LBB0_11-(LCPI0_2+4))/2
+; CHECK-NEXT: .end_data_region
+; CHECK-NEXT: .p2align 1
+; CHECK-NEXT: LBB0_11:
+; CHECK-NEXT: b LBB0_4
+; CHECK-NEXT: LBB0_12:
+; CHECK-NEXT: trap
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: @ %bb.13:
+ %1 = alloca [14 x i8], align 16
+ %2 = invoke i32 @"foo2"(ptr null, ptr null) #1
+ to label %3 unwind label %4
+
+3: ; preds = %0
+ invoke void @"foo3"(ptr null, ptr null, ptr null) #2
+ to label %6 unwind label %4
+
+4: ; preds = %3, %0
+ %5 = landingpad { ptr, i32 }
+ cleanup
+ ret ptr null
+
+6: ; preds = %3
+ unreachable
+}
+
+declare i32 @__gxx_personality_sj0(...)
+declare i32 @foo2(ptr,ptr)
+declare void @foo3(ptr,ptr,ptr)
+; uselistorder directives
+uselistorder ptr null, { 2, 3, 4, 5, 0, 6, 7, 1, 8, 9 }
+
+attributes #0 = { mustprogress ssp "frame-pointer"="all" "no-builtin-calloc" "no-builtin-stpcpy" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { noreturn }
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
index 3562b93..9e1aa10 100644
--- a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
@@ -1,28 +1,21 @@
; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s
-; RUN: llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - | llvm-objdump --mcpu=avr25 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR25 %s
; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s
; ATTINY85: <main>:
; ATTINY85-NEXT: andi r24, 0x1
; ATTINY85: cpi r24, 0x0
-; ATTINY85-NEXT: breq .+2
-; ATTINY85-NEXT: rjmp .+4086
+; ATTINY85-NEXT: breq .-2
+; ATTINY85-NEXT: R_AVR_7_PCREL .text+0x100c
+; ATTINY85-NEXT: rjmp .-2
+; ATTINY85-NEXT: R_AVR_13_PCREL .text+0x2
; ATTINY85: ldi r24, 0x3
; ATTINY85-NEXT: ret
-; AVR25: <main>:
-; AVR25-NEXT: andi r24, 0x1
-; AVR25: cpi r24, 0x0
-; AVR25-NEXT: breq .+2
-; AVR25-NEXT: rjmp .-2
-; AVR25-NEXT: R_AVR_13_PCREL .text+0x2
-; AVR25: ldi r24, 0x3
-; AVR25-NEXT: ret
-
; AVR3: <main>:
; AVR3-NEXT: andi r24, 0x1
; AVR3: cpi r24, 0x0
-; AVR3-NEXT: breq .+4
+; AVR3-NEXT: breq .-2
+; AVR3-NEXT: R_AVR_7_PCREL .text+0x100e
; AVR3-NEXT: jmp 0x0
; AVR3-NEXT: R_AVR_CALL .text+0x2
; AVR3: ldi r24, 0x3
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
index a51cf42..1fc84a7 100644
--- a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
@@ -1,28 +1,21 @@
; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s
-; RUN: llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - | llvm-objdump --mcpu=avr25 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR25 %s
; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s
; ATTINY85: <main>:
; ATTINY85-NEXT: andi r24, 0x1
; ATTINY85-NEXT: cpi r24, 0x0
-; ATTINY85-NEXT: brne .+2
-; ATTINY85-NEXT: rjmp .-4092
+; ATTINY85-NEXT: brne .-2
+; ATTINY85-NEXT: R_AVR_7_PCREL .text+0x8
+; ATTINY85-NEXT: rjmp .-2
+; ATTINY85-NEXT: R_AVR_13_PCREL .text+0x100c
; ATTINY85: ldi r24, 0x3
; ATTINY85-NEXT: ret
-; AVR25: <main>:
-; AVR25-NEXT: andi r24, 0x1
-; AVR25-NEXT: cpi r24, 0x0
-; AVR25-NEXT: brne .+2
-; AVR25-NEXT: rjmp .-2
-; AVR25-NEXT: R_AVR_13_PCREL .text+0x100c
-; AVR25: ldi r24, 0x3
-; AVR25-NEXT: ret
-
; AVR3: <main>:
; AVR3-NEXT: andi r24, 0x1
; AVR3-NEXT: cpi r24, 0x0
-; AVR3-NEXT: brne .+4
+; AVR3-NEXT: brne .-2
+; AVR3-NEXT: R_AVR_7_PCREL .text+0xa
; AVR3-NEXT: jmp 0x0
; AVR3-NEXT: R_AVR_CALL .text+0x100e
; AVR3: ldi r24, 0x3
diff --git a/llvm/test/CodeGen/AVR/jmp.ll b/llvm/test/CodeGen/AVR/jmp.ll
index 95dfff4..1cbc637 100644
--- a/llvm/test/CodeGen/AVR/jmp.ll
+++ b/llvm/test/CodeGen/AVR/jmp.ll
@@ -18,7 +18,8 @@ declare i8 @bar(i8);
; CHECK: rcall .-2
; CHECK-NEXT: 00000000: R_AVR_13_PCREL bar
; CHECK-NEXT: cpi r24, 0x7b
-; CHECK-NEXT: brne .+4
+; CHECK-NEXT: brne .-2
+; CHECK-NEXT: R_AVR_7_PCREL .text+0xa
; CHECK-NEXT: ldi r24, 0x64
; CHECK-NEXT: ret
; CHECK-NEXT: ldi r24, 0xc8
diff --git a/llvm/test/CodeGen/AVR/llvm.sincos.ll b/llvm/test/CodeGen/AVR/llvm.sincos.ll
new file mode 100644
index 0000000..897101d
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/llvm.sincos.ll
@@ -0,0 +1,883 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=avr-unknown-unknown < %s | FileCheck -check-prefixes=CHECK,NONGNU %s
+; RUN: llc -mtriple=avr-unknown-linux-gnu < %s | FileCheck -check-prefixes=CHECK,GNU %s
+
+define { half, half } @test_sincos_f16(half %a) #0 {
+; NONGNU-LABEL: test_sincos_f16:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: push r12
+; NONGNU-NEXT: push r13
+; NONGNU-NEXT: push r14
+; NONGNU-NEXT: push r15
+; NONGNU-NEXT: push r16
+; NONGNU-NEXT: push r17
+; NONGNU-NEXT: mov r24, r22
+; NONGNU-NEXT: mov r25, r23
+; NONGNU-NEXT: rcall __extendhfsf2
+; NONGNU-NEXT: mov r16, r22
+; NONGNU-NEXT: mov r17, r23
+; NONGNU-NEXT: mov r14, r24
+; NONGNU-NEXT: mov r15, r25
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r12, r24
+; NONGNU-NEXT: mov r13, r25
+; NONGNU-NEXT: mov r22, r16
+; NONGNU-NEXT: mov r23, r17
+; NONGNU-NEXT: mov r24, r14
+; NONGNU-NEXT: mov r25, r15
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r22, r24
+; NONGNU-NEXT: mov r23, r25
+; NONGNU-NEXT: mov r18, r12
+; NONGNU-NEXT: mov r19, r13
+; NONGNU-NEXT: pop r17
+; NONGNU-NEXT: pop r16
+; NONGNU-NEXT: pop r15
+; NONGNU-NEXT: pop r14
+; NONGNU-NEXT: pop r13
+; NONGNU-NEXT: pop r12
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_f16:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r16
+; GNU-NEXT: push r17
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r24, r22
+; GNU-NEXT: mov r25, r23
+; GNU-NEXT: rcall __extendhfsf2
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 251
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 255
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: ldd r22, Y+5
+; GNU-NEXT: ldd r23, Y+6
+; GNU-NEXT: ldd r24, Y+7
+; GNU-NEXT: ldd r25, Y+8
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r16, r24
+; GNU-NEXT: mov r17, r25
+; GNU-NEXT: ldd r22, Y+1
+; GNU-NEXT: ldd r23, Y+2
+; GNU-NEXT: ldd r24, Y+3
+; GNU-NEXT: ldd r25, Y+4
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r22, r24
+; GNU-NEXT: mov r23, r25
+; GNU-NEXT: mov r18, r16
+; GNU-NEXT: mov r19, r17
+; GNU-NEXT: adiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: pop r17
+; GNU-NEXT: pop r16
+; GNU-NEXT: ret
+ %result = call { half, half } @llvm.sincos.f16(half %a)
+ ret { half, half } %result
+}
+
+define half @test_sincos_f16_only_use_sin(half %a) #0 {
+; NONGNU-LABEL: test_sincos_f16_only_use_sin:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: mov r24, r22
+; NONGNU-NEXT: mov r25, r23
+; NONGNU-NEXT: rcall __extendhfsf2
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r22, r24
+; NONGNU-NEXT: mov r23, r25
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_f16_only_use_sin:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r24, r22
+; GNU-NEXT: mov r25, r23
+; GNU-NEXT: rcall __extendhfsf2
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 251
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 255
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: ldd r22, Y+5
+; GNU-NEXT: ldd r23, Y+6
+; GNU-NEXT: ldd r24, Y+7
+; GNU-NEXT: ldd r25, Y+8
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r22, r24
+; GNU-NEXT: mov r23, r25
+; GNU-NEXT: adiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: ret
+ %result = call { half, half } @llvm.sincos.f16(half %a)
+ %result.0 = extractvalue { half, half } %result, 0
+ ret half %result.0
+}
+
+define half @test_sincos_f16_only_use_cos(half %a) #0 {
+; NONGNU-LABEL: test_sincos_f16_only_use_cos:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: mov r24, r22
+; NONGNU-NEXT: mov r25, r23
+; NONGNU-NEXT: rcall __extendhfsf2
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r22, r24
+; NONGNU-NEXT: mov r23, r25
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_f16_only_use_cos:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r24, r22
+; GNU-NEXT: mov r25, r23
+; GNU-NEXT: rcall __extendhfsf2
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 251
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 255
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: ldd r22, Y+1
+; GNU-NEXT: ldd r23, Y+2
+; GNU-NEXT: ldd r24, Y+3
+; GNU-NEXT: ldd r25, Y+4
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r22, r24
+; GNU-NEXT: mov r23, r25
+; GNU-NEXT: adiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: ret
+ %result = call { half, half } @llvm.sincos.f16(half %a)
+ %result.1 = extractvalue { half, half } %result, 1
+ ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) #0 {
+; NONGNU-LABEL: test_sincos_v2f16:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: push r6
+; NONGNU-NEXT: push r7
+; NONGNU-NEXT: push r8
+; NONGNU-NEXT: push r9
+; NONGNU-NEXT: push r10
+; NONGNU-NEXT: push r11
+; NONGNU-NEXT: push r12
+; NONGNU-NEXT: push r13
+; NONGNU-NEXT: push r14
+; NONGNU-NEXT: push r15
+; NONGNU-NEXT: push r16
+; NONGNU-NEXT: push r17
+; NONGNU-NEXT: mov r10, r22
+; NONGNU-NEXT: mov r11, r23
+; NONGNU-NEXT: rcall __extendhfsf2
+; NONGNU-NEXT: mov r16, r22
+; NONGNU-NEXT: mov r17, r23
+; NONGNU-NEXT: mov r14, r24
+; NONGNU-NEXT: mov r15, r25
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r12, r24
+; NONGNU-NEXT: mov r13, r25
+; NONGNU-NEXT: mov r24, r10
+; NONGNU-NEXT: mov r25, r11
+; NONGNU-NEXT: rcall __extendhfsf2
+; NONGNU-NEXT: mov r10, r22
+; NONGNU-NEXT: mov r11, r23
+; NONGNU-NEXT: mov r8, r24
+; NONGNU-NEXT: mov r9, r25
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r6, r24
+; NONGNU-NEXT: mov r7, r25
+; NONGNU-NEXT: mov r22, r10
+; NONGNU-NEXT: mov r23, r11
+; NONGNU-NEXT: mov r24, r8
+; NONGNU-NEXT: mov r25, r9
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r10, r24
+; NONGNU-NEXT: mov r11, r25
+; NONGNU-NEXT: mov r22, r16
+; NONGNU-NEXT: mov r23, r17
+; NONGNU-NEXT: mov r24, r14
+; NONGNU-NEXT: mov r25, r15
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: rcall __truncsfhf2
+; NONGNU-NEXT: mov r18, r10
+; NONGNU-NEXT: mov r19, r11
+; NONGNU-NEXT: mov r20, r12
+; NONGNU-NEXT: mov r21, r13
+; NONGNU-NEXT: mov r22, r6
+; NONGNU-NEXT: mov r23, r7
+; NONGNU-NEXT: pop r17
+; NONGNU-NEXT: pop r16
+; NONGNU-NEXT: pop r15
+; NONGNU-NEXT: pop r14
+; NONGNU-NEXT: pop r13
+; NONGNU-NEXT: pop r12
+; NONGNU-NEXT: pop r11
+; NONGNU-NEXT: pop r10
+; NONGNU-NEXT: pop r9
+; NONGNU-NEXT: pop r8
+; NONGNU-NEXT: pop r7
+; NONGNU-NEXT: pop r6
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_v2f16:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r12
+; GNU-NEXT: push r13
+; GNU-NEXT: push r14
+; GNU-NEXT: push r15
+; GNU-NEXT: push r16
+; GNU-NEXT: push r17
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 16
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r16, r24
+; GNU-NEXT: mov r17, r25
+; GNU-NEXT: mov r24, r22
+; GNU-NEXT: mov r25, r23
+; GNU-NEXT: rcall __extendhfsf2
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 243
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 247
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: mov r24, r16
+; GNU-NEXT: mov r25, r17
+; GNU-NEXT: rcall __extendhfsf2
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 251
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 255
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: ldd r22, Y+13
+; GNU-NEXT: ldd r23, Y+14
+; GNU-NEXT: ldd r24, Y+15
+; GNU-NEXT: ldd r25, Y+16
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r16, r24
+; GNU-NEXT: mov r17, r25
+; GNU-NEXT: ldd r22, Y+5
+; GNU-NEXT: ldd r23, Y+6
+; GNU-NEXT: ldd r24, Y+7
+; GNU-NEXT: ldd r25, Y+8
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r14, r24
+; GNU-NEXT: mov r15, r25
+; GNU-NEXT: ldd r22, Y+9
+; GNU-NEXT: ldd r23, Y+10
+; GNU-NEXT: ldd r24, Y+11
+; GNU-NEXT: ldd r25, Y+12
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r12, r24
+; GNU-NEXT: mov r13, r25
+; GNU-NEXT: ldd r22, Y+1
+; GNU-NEXT: ldd r23, Y+2
+; GNU-NEXT: ldd r24, Y+3
+; GNU-NEXT: ldd r25, Y+4
+; GNU-NEXT: rcall __truncsfhf2
+; GNU-NEXT: mov r18, r16
+; GNU-NEXT: mov r19, r17
+; GNU-NEXT: mov r20, r14
+; GNU-NEXT: mov r21, r15
+; GNU-NEXT: mov r22, r12
+; GNU-NEXT: mov r23, r13
+; GNU-NEXT: adiw r28, 16
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: pop r17
+; GNU-NEXT: pop r16
+; GNU-NEXT: pop r15
+; GNU-NEXT: pop r14
+; GNU-NEXT: pop r13
+; GNU-NEXT: pop r12
+; GNU-NEXT: ret
+ %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a)
+ ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_sincos_f32(float %a) #0 {
+; NONGNU-LABEL: test_sincos_f32:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: push r10
+; NONGNU-NEXT: push r11
+; NONGNU-NEXT: push r12
+; NONGNU-NEXT: push r13
+; NONGNU-NEXT: push r14
+; NONGNU-NEXT: push r15
+; NONGNU-NEXT: push r16
+; NONGNU-NEXT: push r17
+; NONGNU-NEXT: mov r16, r24
+; NONGNU-NEXT: mov r17, r25
+; NONGNU-NEXT: mov r14, r22
+; NONGNU-NEXT: mov r15, r23
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: mov r12, r22
+; NONGNU-NEXT: mov r13, r23
+; NONGNU-NEXT: mov r10, r24
+; NONGNU-NEXT: mov r11, r25
+; NONGNU-NEXT: mov r22, r14
+; NONGNU-NEXT: mov r23, r15
+; NONGNU-NEXT: mov r24, r16
+; NONGNU-NEXT: mov r25, r17
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: mov r18, r12
+; NONGNU-NEXT: mov r19, r13
+; NONGNU-NEXT: mov r20, r10
+; NONGNU-NEXT: mov r21, r11
+; NONGNU-NEXT: pop r17
+; NONGNU-NEXT: pop r16
+; NONGNU-NEXT: pop r15
+; NONGNU-NEXT: pop r14
+; NONGNU-NEXT: pop r13
+; NONGNU-NEXT: pop r12
+; NONGNU-NEXT: pop r11
+; NONGNU-NEXT: pop r10
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_f32:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 251
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 255
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: ldd r18, Y+5
+; GNU-NEXT: ldd r19, Y+6
+; GNU-NEXT: ldd r20, Y+7
+; GNU-NEXT: ldd r21, Y+8
+; GNU-NEXT: ldd r22, Y+1
+; GNU-NEXT: ldd r23, Y+2
+; GNU-NEXT: ldd r24, Y+3
+; GNU-NEXT: ldd r25, Y+4
+; GNU-NEXT: adiw r28, 8
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: ret
+ %result = call { float, float } @llvm.sincos.f32(float %a)
+ ret { float, float } %result
+}
+
+define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
+; NONGNU-LABEL: test_sincos_v2f32:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: push r8
+; NONGNU-NEXT: push r9
+; NONGNU-NEXT: push r10
+; NONGNU-NEXT: push r11
+; NONGNU-NEXT: push r12
+; NONGNU-NEXT: push r13
+; NONGNU-NEXT: push r14
+; NONGNU-NEXT: push r15
+; NONGNU-NEXT: mov r14, r22
+; NONGNU-NEXT: mov r15, r23
+; NONGNU-NEXT: mov r12, r20
+; NONGNU-NEXT: mov r13, r21
+; NONGNU-NEXT: mov r10, r18
+; NONGNU-NEXT: mov r11, r19
+; NONGNU-NEXT: mov r8, r24
+; NONGNU-NEXT: mov r9, r25
+; NONGNU-NEXT: mov r22, r12
+; NONGNU-NEXT: mov r23, r13
+; NONGNU-NEXT: mov r24, r14
+; NONGNU-NEXT: mov r25, r15
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: mov r30, r8
+; NONGNU-NEXT: mov r31, r9
+; NONGNU-NEXT: std Z+15, r25
+; NONGNU-NEXT: std Z+14, r24
+; NONGNU-NEXT: std Z+13, r23
+; NONGNU-NEXT: std Z+12, r22
+; NONGNU-NEXT: mov r22, r16
+; NONGNU-NEXT: mov r23, r17
+; NONGNU-NEXT: mov r24, r10
+; NONGNU-NEXT: mov r25, r11
+; NONGNU-NEXT: rcall cos
+; NONGNU-NEXT: mov r30, r8
+; NONGNU-NEXT: mov r31, r9
+; NONGNU-NEXT: std Z+11, r25
+; NONGNU-NEXT: std Z+10, r24
+; NONGNU-NEXT: std Z+9, r23
+; NONGNU-NEXT: std Z+8, r22
+; NONGNU-NEXT: mov r22, r12
+; NONGNU-NEXT: mov r23, r13
+; NONGNU-NEXT: mov r24, r14
+; NONGNU-NEXT: mov r25, r15
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: mov r30, r8
+; NONGNU-NEXT: mov r31, r9
+; NONGNU-NEXT: std Z+7, r25
+; NONGNU-NEXT: std Z+6, r24
+; NONGNU-NEXT: std Z+5, r23
+; NONGNU-NEXT: std Z+4, r22
+; NONGNU-NEXT: mov r22, r16
+; NONGNU-NEXT: mov r23, r17
+; NONGNU-NEXT: mov r24, r10
+; NONGNU-NEXT: mov r25, r11
+; NONGNU-NEXT: rcall sin
+; NONGNU-NEXT: mov r30, r8
+; NONGNU-NEXT: mov r31, r9
+; NONGNU-NEXT: std Z+3, r25
+; NONGNU-NEXT: std Z+2, r24
+; NONGNU-NEXT: std Z+1, r23
+; NONGNU-NEXT: st Z, r22
+; NONGNU-NEXT: pop r15
+; NONGNU-NEXT: pop r14
+; NONGNU-NEXT: pop r13
+; NONGNU-NEXT: pop r12
+; NONGNU-NEXT: pop r11
+; NONGNU-NEXT: pop r10
+; NONGNU-NEXT: pop r9
+; NONGNU-NEXT: pop r8
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_v2f32:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r12
+; GNU-NEXT: push r13
+; GNU-NEXT: push r14
+; GNU-NEXT: push r15
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 16
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r30, r22
+; GNU-NEXT: mov r31, r23
+; GNU-NEXT: mov r14, r18
+; GNU-NEXT: mov r15, r19
+; GNU-NEXT: mov r12, r24
+; GNU-NEXT: mov r13, r25
+; GNU-NEXT: mov r26, r28
+; GNU-NEXT: mov r27, r29
+; GNU-NEXT: adiw r26, 13
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 247
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: mov r22, r20
+; GNU-NEXT: mov r23, r21
+; GNU-NEXT: mov r24, r30
+; GNU-NEXT: mov r25, r31
+; GNU-NEXT: mov r20, r26
+; GNU-NEXT: mov r21, r27
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: mov r20, r28
+; GNU-NEXT: mov r21, r29
+; GNU-NEXT: subi r20, 251
+; GNU-NEXT: sbci r21, 255
+; GNU-NEXT: mov r18, r28
+; GNU-NEXT: mov r19, r29
+; GNU-NEXT: subi r18, 255
+; GNU-NEXT: sbci r19, 255
+; GNU-NEXT: mov r22, r16
+; GNU-NEXT: mov r23, r17
+; GNU-NEXT: mov r24, r14
+; GNU-NEXT: mov r25, r15
+; GNU-NEXT: rcall sincosf
+; GNU-NEXT: ldd r24, Y+11
+; GNU-NEXT: ldd r25, Y+12
+; GNU-NEXT: mov r30, r12
+; GNU-NEXT: mov r31, r13
+; GNU-NEXT: std Z+15, r25
+; GNU-NEXT: std Z+14, r24
+; GNU-NEXT: ldd r24, Y+9
+; GNU-NEXT: ldd r25, Y+10
+; GNU-NEXT: std Z+13, r25
+; GNU-NEXT: std Z+12, r24
+; GNU-NEXT: ldd r24, Y+3
+; GNU-NEXT: ldd r25, Y+4
+; GNU-NEXT: std Z+11, r25
+; GNU-NEXT: std Z+10, r24
+; GNU-NEXT: ldd r24, Y+1
+; GNU-NEXT: ldd r25, Y+2
+; GNU-NEXT: std Z+9, r25
+; GNU-NEXT: std Z+8, r24
+; GNU-NEXT: ldd r24, Y+15
+; GNU-NEXT: ldd r25, Y+16
+; GNU-NEXT: std Z+7, r25
+; GNU-NEXT: std Z+6, r24
+; GNU-NEXT: ldd r24, Y+13
+; GNU-NEXT: ldd r25, Y+14
+; GNU-NEXT: std Z+5, r25
+; GNU-NEXT: std Z+4, r24
+; GNU-NEXT: ldd r24, Y+7
+; GNU-NEXT: ldd r25, Y+8
+; GNU-NEXT: std Z+3, r25
+; GNU-NEXT: std Z+2, r24
+; GNU-NEXT: ldd r24, Y+5
+; GNU-NEXT: ldd r25, Y+6
+; GNU-NEXT: std Z+1, r25
+; GNU-NEXT: st Z, r24
+; GNU-NEXT: adiw r28, 16
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: pop r15
+; GNU-NEXT: pop r14
+; GNU-NEXT: pop r13
+; GNU-NEXT: pop r12
+; GNU-NEXT: ret
+ %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a)
+ ret { <2 x float>, <2 x float> } %result
+}
+
+; FIXME: Broken
+; define { double, double } @test_sincos_f64(double %a) #0 {
+; %result = call { double, double } @llvm.sincos.f64(double %a)
+; ret { double, double } %result
+; }
+
+; FIXME: Broken
+; define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
+; %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a)
+; ret { <2 x double>, <2 x double> } %result
+; }
+
+define { fp128, fp128 } @test_sincos_f128(fp128 %a) #0 {
+; NONGNU-LABEL: test_sincos_f128:
+; NONGNU: ; %bb.0:
+; NONGNU-NEXT: push r2
+; NONGNU-NEXT: push r3
+; NONGNU-NEXT: push r4
+; NONGNU-NEXT: push r5
+; NONGNU-NEXT: push r6
+; NONGNU-NEXT: push r7
+; NONGNU-NEXT: push r28
+; NONGNU-NEXT: push r29
+; NONGNU-NEXT: in r28, 61
+; NONGNU-NEXT: in r29, 62
+; NONGNU-NEXT: sbiw r28, 34
+; NONGNU-NEXT: in r0, 63
+; NONGNU-NEXT: cli
+; NONGNU-NEXT: out 62, r29
+; NONGNU-NEXT: out 63, r0
+; NONGNU-NEXT: out 61, r28
+; NONGNU-NEXT: std Y+2, r23 ; 2-byte Folded Spill
+; NONGNU-NEXT: std Y+1, r22 ; 2-byte Folded Spill
+; NONGNU-NEXT: mov r2, r20
+; NONGNU-NEXT: mov r3, r21
+; NONGNU-NEXT: mov r4, r18
+; NONGNU-NEXT: mov r5, r19
+; NONGNU-NEXT: mov r6, r24
+; NONGNU-NEXT: mov r7, r25
+; NONGNU-NEXT: mov r24, r28
+; NONGNU-NEXT: mov r25, r29
+; NONGNU-NEXT: adiw r24, 3
+; NONGNU-NEXT: rcall cosl
+; NONGNU-NEXT: mov r24, r28
+; NONGNU-NEXT: mov r25, r29
+; NONGNU-NEXT: adiw r24, 19
+; NONGNU-NEXT: mov r18, r4
+; NONGNU-NEXT: mov r19, r5
+; NONGNU-NEXT: mov r20, r2
+; NONGNU-NEXT: mov r21, r3
+; NONGNU-NEXT: ldd r22, Y+1 ; 2-byte Folded Reload
+; NONGNU-NEXT: ldd r23, Y+2 ; 2-byte Folded Reload
+; NONGNU-NEXT: rcall sinl
+; NONGNU-NEXT: ldd r24, Y+17
+; NONGNU-NEXT: ldd r25, Y+18
+; NONGNU-NEXT: mov r30, r6
+; NONGNU-NEXT: mov r31, r7
+; NONGNU-NEXT: std Z+31, r25
+; NONGNU-NEXT: std Z+30, r24
+; NONGNU-NEXT: ldd r24, Y+15
+; NONGNU-NEXT: ldd r25, Y+16
+; NONGNU-NEXT: std Z+29, r25
+; NONGNU-NEXT: std Z+28, r24
+; NONGNU-NEXT: ldd r24, Y+13
+; NONGNU-NEXT: ldd r25, Y+14
+; NONGNU-NEXT: std Z+27, r25
+; NONGNU-NEXT: std Z+26, r24
+; NONGNU-NEXT: ldd r24, Y+11
+; NONGNU-NEXT: ldd r25, Y+12
+; NONGNU-NEXT: std Z+25, r25
+; NONGNU-NEXT: std Z+24, r24
+; NONGNU-NEXT: ldd r24, Y+9
+; NONGNU-NEXT: ldd r25, Y+10
+; NONGNU-NEXT: std Z+23, r25
+; NONGNU-NEXT: std Z+22, r24
+; NONGNU-NEXT: ldd r24, Y+7
+; NONGNU-NEXT: ldd r25, Y+8
+; NONGNU-NEXT: std Z+21, r25
+; NONGNU-NEXT: std Z+20, r24
+; NONGNU-NEXT: ldd r24, Y+5
+; NONGNU-NEXT: ldd r25, Y+6
+; NONGNU-NEXT: std Z+19, r25
+; NONGNU-NEXT: std Z+18, r24
+; NONGNU-NEXT: ldd r24, Y+3
+; NONGNU-NEXT: ldd r25, Y+4
+; NONGNU-NEXT: std Z+17, r25
+; NONGNU-NEXT: std Z+16, r24
+; NONGNU-NEXT: ldd r24, Y+33
+; NONGNU-NEXT: ldd r25, Y+34
+; NONGNU-NEXT: std Z+15, r25
+; NONGNU-NEXT: std Z+14, r24
+; NONGNU-NEXT: ldd r24, Y+31
+; NONGNU-NEXT: ldd r25, Y+32
+; NONGNU-NEXT: std Z+13, r25
+; NONGNU-NEXT: std Z+12, r24
+; NONGNU-NEXT: ldd r24, Y+29
+; NONGNU-NEXT: ldd r25, Y+30
+; NONGNU-NEXT: std Z+11, r25
+; NONGNU-NEXT: std Z+10, r24
+; NONGNU-NEXT: ldd r24, Y+27
+; NONGNU-NEXT: ldd r25, Y+28
+; NONGNU-NEXT: std Z+9, r25
+; NONGNU-NEXT: std Z+8, r24
+; NONGNU-NEXT: ldd r24, Y+25
+; NONGNU-NEXT: ldd r25, Y+26
+; NONGNU-NEXT: std Z+7, r25
+; NONGNU-NEXT: std Z+6, r24
+; NONGNU-NEXT: ldd r24, Y+23
+; NONGNU-NEXT: ldd r25, Y+24
+; NONGNU-NEXT: std Z+5, r25
+; NONGNU-NEXT: std Z+4, r24
+; NONGNU-NEXT: ldd r24, Y+21
+; NONGNU-NEXT: ldd r25, Y+22
+; NONGNU-NEXT: std Z+3, r25
+; NONGNU-NEXT: std Z+2, r24
+; NONGNU-NEXT: ldd r24, Y+19
+; NONGNU-NEXT: ldd r25, Y+20
+; NONGNU-NEXT: std Z+1, r25
+; NONGNU-NEXT: st Z, r24
+; NONGNU-NEXT: adiw r28, 34
+; NONGNU-NEXT: in r0, 63
+; NONGNU-NEXT: cli
+; NONGNU-NEXT: out 62, r29
+; NONGNU-NEXT: out 63, r0
+; NONGNU-NEXT: out 61, r28
+; NONGNU-NEXT: pop r29
+; NONGNU-NEXT: pop r28
+; NONGNU-NEXT: pop r7
+; NONGNU-NEXT: pop r6
+; NONGNU-NEXT: pop r5
+; NONGNU-NEXT: pop r4
+; NONGNU-NEXT: pop r3
+; NONGNU-NEXT: pop r2
+; NONGNU-NEXT: ret
+;
+; GNU-LABEL: test_sincos_f128:
+; GNU: ; %bb.0:
+; GNU-NEXT: push r6
+; GNU-NEXT: push r7
+; GNU-NEXT: push r28
+; GNU-NEXT: push r29
+; GNU-NEXT: in r28, 61
+; GNU-NEXT: in r29, 62
+; GNU-NEXT: sbiw r28, 52
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: mov r6, r24
+; GNU-NEXT: mov r7, r25
+; GNU-NEXT: mov r24, r28
+; GNU-NEXT: mov r25, r29
+; GNU-NEXT: adiw r24, 21
+; GNU-NEXT: std Y+4, r25
+; GNU-NEXT: std Y+3, r24
+; GNU-NEXT: mov r24, r28
+; GNU-NEXT: mov r25, r29
+; GNU-NEXT: adiw r24, 37
+; GNU-NEXT: std Y+2, r25
+; GNU-NEXT: std Y+1, r24
+; GNU-NEXT: mov r24, r28
+; GNU-NEXT: mov r25, r29
+; GNU-NEXT: adiw r24, 5
+; GNU-NEXT: rcall sincosl
+; GNU-NEXT: ldd r24, Y+35
+; GNU-NEXT: ldd r25, Y+36
+; GNU-NEXT: mov r30, r6
+; GNU-NEXT: mov r31, r7
+; GNU-NEXT: std Z+31, r25
+; GNU-NEXT: std Z+30, r24
+; GNU-NEXT: ldd r24, Y+33
+; GNU-NEXT: ldd r25, Y+34
+; GNU-NEXT: std Z+29, r25
+; GNU-NEXT: std Z+28, r24
+; GNU-NEXT: ldd r24, Y+31
+; GNU-NEXT: ldd r25, Y+32
+; GNU-NEXT: std Z+27, r25
+; GNU-NEXT: std Z+26, r24
+; GNU-NEXT: ldd r24, Y+29
+; GNU-NEXT: ldd r25, Y+30
+; GNU-NEXT: std Z+25, r25
+; GNU-NEXT: std Z+24, r24
+; GNU-NEXT: ldd r24, Y+27
+; GNU-NEXT: ldd r25, Y+28
+; GNU-NEXT: std Z+23, r25
+; GNU-NEXT: std Z+22, r24
+; GNU-NEXT: ldd r24, Y+25
+; GNU-NEXT: ldd r25, Y+26
+; GNU-NEXT: std Z+21, r25
+; GNU-NEXT: std Z+20, r24
+; GNU-NEXT: ldd r24, Y+23
+; GNU-NEXT: ldd r25, Y+24
+; GNU-NEXT: std Z+19, r25
+; GNU-NEXT: std Z+18, r24
+; GNU-NEXT: ldd r24, Y+21
+; GNU-NEXT: ldd r25, Y+22
+; GNU-NEXT: std Z+17, r25
+; GNU-NEXT: std Z+16, r24
+; GNU-NEXT: ldd r24, Y+51
+; GNU-NEXT: ldd r25, Y+52
+; GNU-NEXT: std Z+15, r25
+; GNU-NEXT: std Z+14, r24
+; GNU-NEXT: ldd r24, Y+49
+; GNU-NEXT: ldd r25, Y+50
+; GNU-NEXT: std Z+13, r25
+; GNU-NEXT: std Z+12, r24
+; GNU-NEXT: ldd r24, Y+47
+; GNU-NEXT: ldd r25, Y+48
+; GNU-NEXT: std Z+11, r25
+; GNU-NEXT: std Z+10, r24
+; GNU-NEXT: ldd r24, Y+45
+; GNU-NEXT: ldd r25, Y+46
+; GNU-NEXT: std Z+9, r25
+; GNU-NEXT: std Z+8, r24
+; GNU-NEXT: ldd r24, Y+43
+; GNU-NEXT: ldd r25, Y+44
+; GNU-NEXT: std Z+7, r25
+; GNU-NEXT: std Z+6, r24
+; GNU-NEXT: ldd r24, Y+41
+; GNU-NEXT: ldd r25, Y+42
+; GNU-NEXT: std Z+5, r25
+; GNU-NEXT: std Z+4, r24
+; GNU-NEXT: ldd r24, Y+39
+; GNU-NEXT: ldd r25, Y+40
+; GNU-NEXT: std Z+3, r25
+; GNU-NEXT: std Z+2, r24
+; GNU-NEXT: ldd r24, Y+37
+; GNU-NEXT: ldd r25, Y+38
+; GNU-NEXT: std Z+1, r25
+; GNU-NEXT: st Z, r24
+; GNU-NEXT: adiw r28, 52
+; GNU-NEXT: in r0, 63
+; GNU-NEXT: cli
+; GNU-NEXT: out 62, r29
+; GNU-NEXT: out 63, r0
+; GNU-NEXT: out 61, r28
+; GNU-NEXT: pop r29
+; GNU-NEXT: pop r28
+; GNU-NEXT: pop r7
+; GNU-NEXT: pop r6
+; GNU-NEXT: ret
+ %result = call { fp128, fp128 } @llvm.sincos.f128(fp128 %a)
+ ret { fp128, fp128 } %result
+}
+
+attributes #0 = { nounwind }
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-doubles.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-doubles.ll
new file mode 100644
index 0000000..5e44b93
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-doubles.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; CHECK: ; Combined Shader Flags for Module
+; CHECK-NEXT: ; Shader Flags Value: 0x00000014
+; CHECK-NEXT: ;
+; CHECK-NEXT: ; Note: shader requires additional functionality:
+; CHECK-NEXT: ; Double-precision floating point
+; CHECK-NEXT: ; Note: extra DXIL module flags:
+; CHECK-NEXT: ; Raw and structured buffers
+; CHECK-NEXT: ;
+; CHECK-NEXT: ; Shader Flags for Module Functions
+
+; CHECK: Function rawbuf : 0x00000014
+define void @rawbuf() "hlsl.export" {
+ %rb = tail call target("dx.RawBuffer", <4 x double>, 0, 0)
+ @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f16_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+ %load = call { <4 x double>, i1 }
+ @llvm.dx.resource.load.rawbuffer.v4double.tdx.RawBuffer_v4f16_0_0t(target("dx.RawBuffer", <4 x double>, 0, 0) %rb, i32 0, i32 0)
+ %extract = extractvalue { <4 x double>, i1 } %load, 0
+ ret void
+}
+
+; Metadata to avoid adding flags not currently of interest to this test
+!dx.valver = !{!0}
+!0 = !{i32 1, i32 8}
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"dx.resmayalias", i32 1}
+
+; DXC: - Name: SFI0
+; DXC-NEXT: Size: 8
+; DXC-NEXT: Flags:
+; DXC-NEXT: Doubles: true
+; DXC: ...
+
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-int64.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-int64.ll
new file mode 100644
index 0000000..517147a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-int64.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; CHECK: ; Combined Shader Flags for Module
+; CHECK-NEXT: ; Shader Flags Value: 0x00100010
+; CHECK-NEXT: ;
+; CHECK-NEXT: ; Note: shader requires additional functionality:
+; CHECK-NEXT: ; 64-Bit integer
+; CHECK-NEXT: ; Note: extra DXIL module flags:
+; CHECK-NEXT: ; Raw and structured buffers
+; CHECK-NEXT: ;
+; CHECK-NEXT: ; Shader Flags for Module Functions
+
+; CHECK: Function rawbuf : 0x00100010
+define void @rawbuf() "hlsl.export" {
+ %rb = tail call target("dx.RawBuffer", <4 x i64>, 0, 0)
+ @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f16_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+ %load = call { <4 x i64>, i1 }
+ @llvm.dx.resource.load.rawbuffer.v4i64.tdx.RawBuffer_v4f16_0_0t(target("dx.RawBuffer", <4 x i64>, 0, 0) %rb, i32 0, i32 0)
+ %extract = extractvalue { <4 x i64>, i1 } %load, 0
+ ret void
+}
+
+; Metadata to avoid adding flags not currently of interest to this test
+!dx.valver = !{!0}
+!0 = !{i32 1, i32 8}
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"dx.resmayalias", i32 1}
+
+; DXC: - Name: SFI0
+; DXC-NEXT: Size: 8
+; DXC-NEXT: Flags:
+; DXC: Int64Ops: true
+; DXC: ...
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-low-precision.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-low-precision.ll
new file mode 100644
index 0000000..cb4a3e9
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/rawbuffer-low-precision.ll
@@ -0,0 +1,44 @@
+; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; CHECK: ; Combined Shader Flags for Module
+; CHECK-NEXT: ; Shader Flags Value: 0x00800030
+; CHECK-NEXT: ;
+; CHECK-NEXT: ; Note: shader requires additional functionality:
+; CHECK-NEXT: ; Native low-precision data types
+; CHECK-NEXT: ; Note: extra DXIL module flags:
+; CHECK-NEXT: ; Raw and structured buffers
+; CHECK-NEXT: ; Low-precision data types present
+; CHECK-NEXT: ; Enable native low-precision data types
+; CHECK-NEXT: ;
+; CHECK-NEXT: ; Shader Flags for Module Functions
+
+; CHECK: Function rawbuf : 0x00800030
+define void @rawbuf() "hlsl.export" {
+ %halfrb = tail call target("dx.RawBuffer", <4 x half>, 0, 0)
+ @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f16_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+ %i16rb = tail call target("dx.RawBuffer", <4 x i16>, 1, 0)
+ @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4i16_1_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr null)
+ %loadhalfrb = call { <4 x i16>, i1 }
+ @llvm.dx.resource.load.rawbuffer.v4i16.tdx.RawBuffer_v4f16_0_0t(target("dx.RawBuffer", <4 x half>, 0, 0) %halfrb, i32 0, i32 0)
+ %extracti16vec = extractvalue { <4 x i16>, i1 } %loadhalfrb, 0
+ call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v4i16_1_0t.v4i16(target("dx.RawBuffer", <4 x i16>, 1, 0) %i16rb, i32 0, i32 0, <4 x i16> %extracti16vec)
+ ret void
+}
+
+; Metadata to avoid adding flags not currently of interest to this test, and
+; to enable native low-precision data types.
+!dx.valver = !{!0}
+!0 = !{i32 1, i32 8}
+!llvm.module.flags = !{!1, !2}
+!1 = !{i32 1, !"dx.nativelowprec", i32 1}
+!2 = !{i32 1, !"dx.resmayalias", i32 1}
+
+; DXC: - Name: SFI0
+; DXC-NEXT: Size: 8
+; DXC-NEXT: Flags:
+; DXC: MinimumPrecision: false
+; DXC: NativeLowPrecision: true
+; DXC: ...
diff --git a/llvm/test/CodeGen/DirectX/UAddc.ll b/llvm/test/CodeGen/DirectX/UAddc.ll
index 4b46b56..dd7aa23 100644
--- a/llvm/test/CodeGen/DirectX/UAddc.ll
+++ b/llvm/test/CodeGen/DirectX/UAddc.ll
@@ -35,14 +35,10 @@ define noundef <2 x i32> @test_UAddc_vec2(<2 x i32> noundef %a, <2 x i32> nounde
; CHECK-NEXT: [[UADDC_I1:%.*]] = call [[DX_TYPES_I32C]] @dx.op.binaryWithCarryOrBorrow.i32(i32 44, i32 [[A_I1]], i32 [[B_I1]]) #[[ATTR0]]
; CHECK-NEXT: [[CARRY_ELEM0:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I0]], 1
; CHECK-NEXT: [[CARRY_ELEM1:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I1]], 1
-; CHECK-NEXT: [[CARRY_UPTO0:%.*]] = insertelement <2 x i1> poison, i1 [[CARRY_ELEM0]], i64 0
-; CHECK-NEXT: [[CARRY:%.*]] = insertelement <2 x i1> [[CARRY_UPTO0]], i1 [[CARRY_ELEM1]], i64 1
-; CHECK-NEXT: [[CARRY_I0:%.*]] = extractelement <2 x i1> [[CARRY]], i64 0
-; CHECK-NEXT: [[CARRY_I1:%.*]] = extractelement <2 x i1> [[CARRY]], i64 1
; CHECK-NEXT: [[SUM_ELEM0:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I0]], 0
; CHECK-NEXT: [[SUM_ELEM1:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I1]], 0
-; CHECK-NEXT: [[CARRY_ZEXT_I0:%.*]] = zext i1 [[CARRY_I0]] to i32
-; CHECK-NEXT: [[CARRY_ZEXT_I1:%.*]] = zext i1 [[CARRY_I1]] to i32
+; CHECK-NEXT: [[CARRY_ZEXT_I0:%.*]] = zext i1 [[CARRY_ELEM0]] to i32
+; CHECK-NEXT: [[CARRY_ZEXT_I1:%.*]] = zext i1 [[CARRY_ELEM1]] to i32
; CHECK-NEXT: [[RESULT_I0:%.*]] = add i32 [[SUM_ELEM0]], [[CARRY_ZEXT_I0]]
; CHECK-NEXT: [[RESULT_I1:%.*]] = add i32 [[SUM_ELEM1]], [[CARRY_ZEXT_I1]]
; CHECK-NEXT: [[RESULT_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[RESULT_I0]], i64 0
diff --git a/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll b/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll
new file mode 100644
index 0000000..156a8e7
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll
@@ -0,0 +1,80 @@
+; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.4-library %s | FileCheck %s --check-prefixes=SCHECK,CHECK
+; RUN: opt -S -passes='dxil-data-scalarization,function(scalarizer<load-store>),dxil-flatten-arrays' -mtriple=dxil-pc-shadermodel6.4-library %s | FileCheck %s --check-prefixes=FCHECK,CHECK
+
+@aTile = hidden addrspace(3) global [10 x [10 x <4 x i32>]] zeroinitializer, align 16
+@bTile = hidden addrspace(3) global [10 x [10 x i32]] zeroinitializer, align 16
+@cTile = internal global [2 x [2 x <2 x i32>]] zeroinitializer, align 16
+@dTile = internal global [2 x [2 x [2 x <2 x i32>]]] zeroinitializer, align 16
+
+define void @CSMain() {
+; CHECK-LABEL: define void @CSMain() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE:%.*]] = alloca [4 x i32], align 16
+;
+; SCHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [10 x <4 x i32>], ptr addrspace(3) getelementptr inbounds ([10 x [10 x [4 x i32]]], ptr addrspace(3) @aTile.scalarized, i32 0, i32 1), i32 0, i32 2
+; SCHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP0]], align 16
+; SCHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[AFRAGPACKED_I_SCALARIZE]], align 16
+;
+; FCHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE_I14:%.*]] = getelementptr [4 x i32], ptr [[AFRAGPACKED_I_SCALARIZE]], i32 0, i32 1
+; FCHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE_I25:%.*]] = getelementptr [4 x i32], ptr [[AFRAGPACKED_I_SCALARIZE]], i32 0, i32 2
+; FCHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE_I36:%.*]] = getelementptr [4 x i32], ptr [[AFRAGPACKED_I_SCALARIZE]], i32 0, i32 3
+; FCHECK-NEXT: [[DOTI07:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([400 x i32], ptr addrspace(3) @aTile.scalarized.1dim, i32 0, i32 48), align 16
+; FCHECK-NEXT: [[DOTI119:%.*]] = load i32, ptr addrspace(3) getelementptr ([400 x i32], ptr addrspace(3) @aTile.scalarized.1dim, i32 0, i32 49), align 4
+; FCHECK-NEXT: [[DOTI2211:%.*]] = load i32, ptr addrspace(3) getelementptr ([400 x i32], ptr addrspace(3) @aTile.scalarized.1dim, i32 0, i32 50), align 8
+; FCHECK-NEXT: [[DOTI3313:%.*]] = load i32, ptr addrspace(3) getelementptr ([400 x i32], ptr addrspace(3) @aTile.scalarized.1dim, i32 0, i32 51), align 4
+; FCHECK-NEXT: store i32 [[DOTI07]], ptr [[AFRAGPACKED_I_SCALARIZE]], align 16
+; FCHECK-NEXT: store i32 [[DOTI119]], ptr [[AFRAGPACKED_I_SCALARIZE_I14]], align 4
+; FCHECK-NEXT: store i32 [[DOTI2211]], ptr [[AFRAGPACKED_I_SCALARIZE_I25]], align 8
+; FCHECK-NEXT: store i32 [[DOTI3313]], ptr [[AFRAGPACKED_I_SCALARIZE_I36]], align 4
+;
+; CHECK-NEXT: ret void
+entry:
+ %aFragPacked.i = alloca <4 x i32>, align 16
+ %0 = load <4 x i32>, ptr addrspace(3) getelementptr inbounds ([10 x <4 x i32>], ptr addrspace(3) getelementptr inbounds ([10 x [10 x <4 x i32>]], ptr addrspace(3) @aTile, i32 0, i32 1), i32 0, i32 2), align 16
+ store <4 x i32> %0, ptr %aFragPacked.i, align 16
+ ret void
+}
+
+define void @Main() {
+; CHECK-LABEL: define void @Main() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[BFRAGPACKED_I:%.*]] = alloca i32, align 16
+;
+; SCHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [10 x i32], ptr addrspace(3) getelementptr inbounds ([10 x [10 x i32]], ptr addrspace(3) @bTile, i32 0, i32 1), i32 0, i32 1
+; SCHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[TMP0]], align 16
+; SCHECK-NEXT: store i32 [[TMP1]], ptr [[BFRAGPACKED_I]], align 16
+;
+; FCHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([100 x i32], ptr addrspace(3) @bTile.1dim, i32 0, i32 11), align 16
+; FCHECK-NEXT: store i32 [[TMP0]], ptr [[BFRAGPACKED_I]], align 16
+;
+; CHECK-NEXT: ret void
+entry:
+ %bFragPacked.i = alloca i32, align 16
+ %0 = load i32, ptr addrspace(3) getelementptr inbounds ([10 x i32], ptr addrspace(3) getelementptr inbounds ([10 x [10 x i32]], ptr addrspace(3) @bTile, i32 0, i32 1), i32 0, i32 1), align 16
+ store i32 %0, ptr %bFragPacked.i, align 16
+ ret void
+}
+
+define void @global_nested_geps_3d() {
+; CHECK-LABEL: define void @global_nested_geps_3d() {
+; SCHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr getelementptr inbounds ([2 x <2 x i32>], ptr getelementptr inbounds ([2 x [2 x [2 x i32]]], ptr @cTile.scalarized, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1
+; SCHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+;
+; FCHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @cTile.scalarized.1dim, i32 0, i32 7), align 4
+;
+; CHECK-NEXT: ret void
+ %1 = load i32, i32* getelementptr inbounds (<2 x i32>, <2 x i32>* getelementptr inbounds ([2 x <2 x i32>], [2 x <2 x i32>]* getelementptr inbounds ([2 x [2 x <2 x i32>]], [2 x [2 x <2 x i32>]]* @cTile, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), align 4
+ ret void
+}
+
+define void @global_nested_geps_4d() {
+; CHECK-LABEL: define void @global_nested_geps_4d() {
+; SCHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr getelementptr inbounds ([2 x <2 x i32>], ptr getelementptr inbounds ([2 x [2 x <2 x i32>]], ptr getelementptr inbounds ([2 x [2 x [2 x [2 x i32]]]], ptr @dTile.scalarized, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), i32 0, i32 1
+; SCHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+;
+; FCHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @dTile.scalarized.1dim, i32 0, i32 15), align 4
+;
+; CHECK-NEXT: ret void
+ %1 = load i32, i32* getelementptr inbounds (<2 x i32>, <2 x i32>* getelementptr inbounds ([2 x <2 x i32>], [2 x <2 x i32>]* getelementptr inbounds ([2 x [2 x <2 x i32>]], [2 x [2 x <2 x i32>]]* getelementptr inbounds ([2 x [2 x [2 x <2 x i32>]]], [2 x [2 x [2 x <2 x i32>]]]* @dTile, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll b/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll
index 40d222c..e6d4c1e 100644
--- a/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll
+++ b/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll
@@ -8,10 +8,12 @@ define void @test_no_transform_of_struct() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[OUTPUTSIZESLOCAL_I:%.*]] = alloca [[STRUCT_RAWSTRUCT8D:%.*]], align 4
; CHECK-NEXT: [[ARRAYINIT_ELEMENT13_I76:%.*]] = getelementptr inbounds nuw [1 x %struct.RawStruct8D], ptr [[OUTPUTSIZESLOCAL_I]], i32 0, i32 0
+; CHECK-NEXT: [[ARRAYINIT_ELEMENT13_I76_I1:%.*]] = getelementptr inbounds nuw [8 x i32], ptr [[ARRAYINIT_ELEMENT13_I76]], i32 0, i32 1
; CHECK-NEXT: ret void
;
entry:
%outputSizesLocal.i = alloca %struct.RawStruct8D, align 4
%arrayinit.element13.i76 = getelementptr inbounds nuw [1 x %struct.RawStruct8D], ptr %outputSizesLocal.i, i32 0, i32 0
+ %arrayinit.element13.i76.i1 = getelementptr inbounds nuw [8 x i32], ptr %arrayinit.element13.i76, i32 0, i32 1
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll b/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll
index f77df2d..77133eb 100644
--- a/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll
+++ b/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll
@@ -1,30 +1,27 @@
; RUN: opt -S -passes='dxil-op-lower' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-SM63
; RUN: opt -S -passes='dxil-op-lower' -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-SM66
-; RUN: opt -S -dxil-op-lower -dxil-prepare -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-PREPARE
+; RUN: opt -S -dxil-prepare -dxil-embed -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-EMBED
+
+; Lifetime intrinsics are not valid prior to shader model 6.6 and are instead
+; replaced with undef stores, provided the validator version is 1.6 or greater.
+
+; The dxil-embed pass removes lifetime intrinsics because they are transformed
+; in a way that is illegal in modern LLVM IR before serializing to DXIL bitcode.
+; So we check that no bitcast or lifetime intrinsics remain after dxil-embed.
; CHECK-LABEL: define void @test_legal_lifetime() {
-;
-; CHECK-SM63-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4
-; CHECK-SM63-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0
-; CHECK-SM63-NEXT: store [1 x i32] undef, ptr [[ACCUM_I_FLAT]], align 4
-; CHECK-SM63-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-SM63-NEXT: store [1 x i32] undef, ptr [[ACCUM_I_FLAT]], align 4
-;
-; CHECK-SM66-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4
-; CHECK-SM66-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0
-; CHECK-SM66-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]])
-; CHECK-SM66-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-SM66-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]])
-;
-; CHECK-PREPARE-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4
-; CHECK-PREPARE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0
-; CHECK-PREPARE-NEXT: [[BITCAST:%.*]] = bitcast ptr [[ACCUM_I_FLAT]] to ptr
-; CHECK-PREPARE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[BITCAST]])
-; CHECK-PREPARE-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-PREPARE-NEXT: [[BITCAST:%.*]] = bitcast ptr [[ACCUM_I_FLAT]] to ptr
-; CHECK-PREPARE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[BITCAST]])
-;
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0
+; CHECK-SM63-NEXT: store [1 x i32] undef, ptr [[ACCUM_I_FLAT]], align 4
+; CHECK-SM66-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]])
+; CHECK-EMBED-NOT: bitcast
+; CHECK-EMBED-NOT: lifetime
+; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
+; CHECK-SM63-NEXT: store [1 x i32] undef, ptr [[ACCUM_I_FLAT]], align 4
+; CHECK-SM66-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]])
+; CHECK-EMBED-NOT: bitcast
+; CHECK-EMBED-NOT: lifetime
+; CHECK-NEXT: ret void
;
define void @test_legal_lifetime() {
%accum.i.flat = alloca [1 x i32], align 4
@@ -35,22 +32,6 @@ define void @test_legal_lifetime() {
ret void
}
-; CHECK-PREPARE-DAG: attributes [[LIFETIME_ATTRS:#.*]] = { nounwind }
-
-; CHECK-PREPARE-DAG: ; Function Attrs: nounwind
-; CHECK-PREPARE-DAG: declare void @llvm.lifetime.start.p0(i64, ptr) [[LIFETIME_ATTRS]]
-
-; CHECK-PREPARE-DAG: ; Function Attrs: nounwind
-; CHECK-PREPARE-DAG: declare void @llvm.lifetime.end.p0(i64, ptr) [[LIFETIME_ATTRS]]
-
-; Function Attrs: nounwind memory(argmem: readwrite)
-declare void @llvm.lifetime.end.p0(i64, ptr) #0
-
-; Function Attrs: nounwind memory(argmem: readwrite)
-declare void @llvm.lifetime.start.p0(i64, ptr) #0
-
-attributes #0 = { nounwind memory(argmem: readwrite) }
-
; Set the validator version to 1.6
!dx.valver = !{!0}
!0 = !{i32 1, i32 6}
diff --git a/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir
new file mode 100644
index 0000000..2960343
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir
@@ -0,0 +1,50 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner %s -o /dev/null
+
+# Check that edges that violate topological order are not added to the
+# SwingSchedulerDAG. This is a reduced test case for the crash caused by PR 145878.
+
+--- |
+ target triple = "hexagon"
+
+ define void @crash_145878() {
+ entry:
+ br label %loop
+
+ loop: ; preds = %loop, %entry
+ %lsr.iv2 = phi i32 [ %lsr.iv.next, %loop ], [ 1, %entry ]
+ %lsr.iv = phi ptr [ %cgep3, %loop ], [ inttoptr (i32 -8 to ptr), %entry ]
+ %cgep = getelementptr i8, ptr %lsr.iv, i32 12
+ %load = load i32, ptr %cgep, align 4
+ store i32 %load, ptr %lsr.iv, align 4
+ %lsr.iv.next = add nsw i32 %lsr.iv2, -1
+ %iv.cmp.not = icmp eq i32 %lsr.iv.next, 0
+ %cgep3 = getelementptr i8, ptr %lsr.iv, i32 -8
+ br i1 %iv.cmp.not, label %exit, label %loop
+
+ exit: ; preds = %loop
+ ret void
+ }
+...
+---
+name: crash_145878
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+
+ %5:intregs = A2_tfrsi -8
+ J2_loop0i %bb.1, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.1.loop (machine-block-address-taken):
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %1:intregs = PHI %5, %bb.0, %3, %bb.1
+ %6:intregs = L2_loadri_io %1, 12 :: (load (s32) from %ir.cgep)
+ S2_storeri_io %1, 0, killed %6 :: (store (s32) into %ir.lsr.iv)
+ %3:intregs = A2_addi %1, -8
+ ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.2, implicit-def dead $pc
+
+ bb.2.exit:
+ PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
index f25e988..086ef54 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
@@ -352,6 +352,81 @@ entry:
ret void
}
+define void @buildvector_v32i8_partial(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a5, i8 %a7, i8 %a8, i8 %a15, i8 %a17, i8 %a18, i8 %a20, i8 %a22, i8 %a23, i8 %a27, i8 %a28, i8 %a31) nounwind {
+; CHECK-LABEL: buildvector_v32i8_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -96
+; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 96
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT: ld.b $t0, $fp, 0
+; CHECK-NEXT: ld.b $t1, $fp, 8
+; CHECK-NEXT: ld.b $t2, $fp, 16
+; CHECK-NEXT: ld.b $t3, $fp, 24
+; CHECK-NEXT: ld.b $t4, $fp, 56
+; CHECK-NEXT: ld.b $t5, $fp, 48
+; CHECK-NEXT: ld.b $t6, $fp, 40
+; CHECK-NEXT: ld.b $t7, $fp, 32
+; CHECK-NEXT: st.b $t4, $sp, 63
+; CHECK-NEXT: st.b $t5, $sp, 60
+; CHECK-NEXT: st.b $t6, $sp, 59
+; CHECK-NEXT: st.b $t7, $sp, 55
+; CHECK-NEXT: st.b $t3, $sp, 54
+; CHECK-NEXT: st.b $t2, $sp, 52
+; CHECK-NEXT: st.b $t1, $sp, 50
+; CHECK-NEXT: st.b $t0, $sp, 49
+; CHECK-NEXT: st.b $a7, $sp, 47
+; CHECK-NEXT: st.b $a6, $sp, 40
+; CHECK-NEXT: st.b $a5, $sp, 39
+; CHECK-NEXT: st.b $a4, $sp, 37
+; CHECK-NEXT: st.b $a3, $sp, 34
+; CHECK-NEXT: st.b $a2, $sp, 33
+; CHECK-NEXT: st.b $a1, $sp, 32
+; CHECK-NEXT: xvld $xr0, $sp, 32
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: addi.d $sp, $fp, -96
+; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <32 x i8> undef, i8 %a0, i32 0
+ %ins1 = insertelement <32 x i8> %ins0, i8 %a1, i32 1
+ %ins2 = insertelement <32 x i8> %ins1, i8 %a2, i32 2
+ %ins3 = insertelement <32 x i8> %ins2, i8 undef, i32 3
+ %ins4 = insertelement <32 x i8> %ins3, i8 undef, i32 4
+ %ins5 = insertelement <32 x i8> %ins4, i8 %a5, i32 5
+ %ins6 = insertelement <32 x i8> %ins5, i8 undef, i32 6
+ %ins7 = insertelement <32 x i8> %ins6, i8 %a7, i32 7
+ %ins8 = insertelement <32 x i8> %ins7, i8 %a8, i32 8
+ %ins9 = insertelement <32 x i8> %ins8, i8 undef, i32 9
+ %ins10 = insertelement <32 x i8> %ins9, i8 undef, i32 10
+ %ins11 = insertelement <32 x i8> %ins10, i8 undef, i32 11
+ %ins12 = insertelement <32 x i8> %ins11, i8 undef, i32 12
+ %ins13 = insertelement <32 x i8> %ins12, i8 undef, i32 13
+ %ins14 = insertelement <32 x i8> %ins13, i8 undef, i32 14
+ %ins15 = insertelement <32 x i8> %ins14, i8 %a15, i32 15
+ %ins16 = insertelement <32 x i8> %ins15, i8 undef, i32 16
+ %ins17 = insertelement <32 x i8> %ins16, i8 %a17, i32 17
+ %ins18 = insertelement <32 x i8> %ins17, i8 %a18, i32 18
+ %ins19 = insertelement <32 x i8> %ins18, i8 undef, i32 19
+ %ins20 = insertelement <32 x i8> %ins19, i8 %a20, i32 20
+ %ins21 = insertelement <32 x i8> %ins20, i8 undef, i32 21
+ %ins22 = insertelement <32 x i8> %ins21, i8 %a22, i32 22
+ %ins23 = insertelement <32 x i8> %ins22, i8 %a23, i32 23
+ %ins24 = insertelement <32 x i8> %ins23, i8 undef, i32 24
+ %ins25 = insertelement <32 x i8> %ins24, i8 undef, i32 25
+ %ins26 = insertelement <32 x i8> %ins25, i8 undef, i32 26
+ %ins27 = insertelement <32 x i8> %ins26, i8 %a27, i32 27
+ %ins28 = insertelement <32 x i8> %ins27, i8 %a28, i32 28
+ %ins29 = insertelement <32 x i8> %ins28, i8 undef, i32 29
+ %ins30 = insertelement <32 x i8> %ins29, i8 undef, i32 30
+ %ins31 = insertelement <32 x i8> %ins30, i8 %a31, i32 31
+ store <32 x i8> %ins31, ptr %dst
+ ret void
+}
+
define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; CHECK-LABEL: buildvector_v16i16:
; CHECK: # %bb.0: # %entry
@@ -419,6 +494,49 @@ entry:
ret void
}
+define void @buildvector_v16i16_partial(ptr %dst, i16 %a0, i16 %a2, i16 %a5, i16 %a6, i16 %a7, i16 %a12, i16 %a13) nounwind {
+; CHECK-LABEL: buildvector_v16i16_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -96
+; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 96
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT: st.h $a7, $sp, 58
+; CHECK-NEXT: st.h $a6, $sp, 56
+; CHECK-NEXT: st.h $a5, $sp, 46
+; CHECK-NEXT: st.h $a4, $sp, 44
+; CHECK-NEXT: st.h $a3, $sp, 42
+; CHECK-NEXT: st.h $a2, $sp, 36
+; CHECK-NEXT: st.h $a1, $sp, 32
+; CHECK-NEXT: xvld $xr0, $sp, 32
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: addi.d $sp, $fp, -96
+; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <16 x i16> undef, i16 %a0, i32 0
+ %ins1 = insertelement <16 x i16> %ins0, i16 undef, i32 1
+ %ins2 = insertelement <16 x i16> %ins1, i16 %a2, i32 2
+ %ins3 = insertelement <16 x i16> %ins2, i16 undef, i32 3
+ %ins4 = insertelement <16 x i16> %ins3, i16 undef, i32 4
+ %ins5 = insertelement <16 x i16> %ins4, i16 %a5, i32 5
+ %ins6 = insertelement <16 x i16> %ins5, i16 %a6, i32 6
+ %ins7 = insertelement <16 x i16> %ins6, i16 %a7, i32 7
+ %ins8 = insertelement <16 x i16> %ins7, i16 undef, i32 8
+ %ins9 = insertelement <16 x i16> %ins8, i16 undef, i32 9
+ %ins10 = insertelement <16 x i16> %ins9, i16 undef, i32 10
+ %ins11 = insertelement <16 x i16> %ins10, i16 undef, i32 11
+ %ins12 = insertelement <16 x i16> %ins11, i16 %a12, i32 12
+ %ins13 = insertelement <16 x i16> %ins12, i16 %a13, i32 13
+ %ins14 = insertelement <16 x i16> %ins13, i16 undef, i32 14
+ %ins15 = insertelement <16 x i16> %ins14, i16 undef, i32 15
+ store <16 x i16> %ins15, ptr %dst
+ ret void
+}
+
define void @buildvector_v8i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; CHECK-LABEL: buildvector_v8i32:
; CHECK: # %bb.0: # %entry
@@ -446,6 +564,38 @@ entry:
ret void
}
+define void @buildvector_v8i32_partial(ptr %dst, i32 %a2, i32 %a4, i32 %a5, i32 %a6) nounwind {
+; CHECK-LABEL: buildvector_v8i32_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -96
+; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 96
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT: st.w $a4, $sp, 56
+; CHECK-NEXT: st.w $a3, $sp, 52
+; CHECK-NEXT: st.w $a2, $sp, 48
+; CHECK-NEXT: st.w $a1, $sp, 40
+; CHECK-NEXT: xvld $xr0, $sp, 32
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: addi.d $sp, $fp, -96
+; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <8 x i32> undef, i32 undef, i32 0
+ %ins1 = insertelement <8 x i32> %ins0, i32 undef, i32 1
+ %ins2 = insertelement <8 x i32> %ins1, i32 %a2, i32 2
+ %ins3 = insertelement <8 x i32> %ins2, i32 undef, i32 3
+ %ins4 = insertelement <8 x i32> %ins3, i32 %a4, i32 4
+ %ins5 = insertelement <8 x i32> %ins4, i32 %a5, i32 5
+ %ins6 = insertelement <8 x i32> %ins5, i32 %a6, i32 6
+ %ins7 = insertelement <8 x i32> %ins6, i32 undef, i32 7
+ store <8 x i32> %ins7, ptr %dst
+ ret void
+}
+
define void @buildvector_v4i64(ptr %dst, i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; CHECK-LABEL: buildvector_v4i64:
; CHECK: # %bb.0: # %entry
@@ -464,25 +614,43 @@ entry:
ret void
}
+define void @buildvector_v4i64_partial(ptr %dst, i64 %a1, i64 %a2) nounwind {
+; CHECK-LABEL: buildvector_v4i64_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
+; CHECK-NEXT: xvinsgr2vr.d $xr1, $a1, 0
+; CHECK-NEXT: xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x i64> undef, i64 undef, i32 0
+ %ins1 = insertelement <4 x i64> %ins0, i64 %a1, i32 1
+ %ins2 = insertelement <4 x i64> %ins1, i64 %a2, i32 2
+ %ins3 = insertelement <4 x i64> %ins2, i64 undef, i32 3
+ store <4 x i64> %ins3, ptr %dst
+ ret void
+}
+
define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; CHECK-LABEL: buildvector_v8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movfr2gr.s $a1, $fa0
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 0
-; CHECK-NEXT: movfr2gr.s $a1, $fa1
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 1
-; CHECK-NEXT: movfr2gr.s $a1, $fa2
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 2
-; CHECK-NEXT: movfr2gr.s $a1, $fa3
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 3
-; CHECK-NEXT: movfr2gr.s $a1, $fa4
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 4
-; CHECK-NEXT: movfr2gr.s $a1, $fa5
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5
-; CHECK-NEXT: movfr2gr.s $a1, $fa6
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 6
-; CHECK-NEXT: movfr2gr.s $a1, $fa7
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 7
+; CHECK-NEXT: # kill: def $f7 killed $f7 def $xr7
+; CHECK-NEXT: # kill: def $f6 killed $f6 def $xr6
+; CHECK-NEXT: # kill: def $f5 killed $f5 def $xr5
+; CHECK-NEXT: # kill: def $f4 killed $f4 def $xr4
+; CHECK-NEXT: # kill: def $f3 killed $f3 def $xr3
+; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1
+; CHECK-NEXT: xvinsve0.w $xr0, $xr2, 2
+; CHECK-NEXT: xvinsve0.w $xr0, $xr3, 3
+; CHECK-NEXT: xvinsve0.w $xr0, $xr4, 4
+; CHECK-NEXT: xvinsve0.w $xr0, $xr5, 5
+; CHECK-NEXT: xvinsve0.w $xr0, $xr6, 6
+; CHECK-NEXT: xvinsve0.w $xr0, $xr7, 7
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -498,17 +666,48 @@ entry:
ret void
}
+define void @buildvector_v8f32_partial(ptr %dst, float %a1, float %a2, float %a5, float %a7) nounwind {
+; CHECK-LABEL: buildvector_v8f32_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -96
+; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 96
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT: fst.s $fa3, $sp, 60
+; CHECK-NEXT: fst.s $fa2, $sp, 52
+; CHECK-NEXT: fst.s $fa1, $sp, 40
+; CHECK-NEXT: fst.s $fa0, $sp, 36
+; CHECK-NEXT: xvld $xr0, $sp, 32
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: addi.d $sp, $fp, -96
+; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <8 x float> undef, float undef, i32 0
+ %ins1 = insertelement <8 x float> %ins0, float %a1, i32 1
+ %ins2 = insertelement <8 x float> %ins1, float %a2, i32 2
+ %ins3 = insertelement <8 x float> %ins2, float undef, i32 3
+ %ins4 = insertelement <8 x float> %ins3, float undef, i32 4
+ %ins5 = insertelement <8 x float> %ins4, float %a5, i32 5
+ %ins6 = insertelement <8 x float> %ins5, float undef, i32 6
+ %ins7 = insertelement <8 x float> %ins6, float %a7, i32 7
+ store <8 x float> %ins7, ptr %dst
+ ret void
+}
+
define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, double %a3) nounwind {
; CHECK-LABEL: buildvector_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movfr2gr.d $a1, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 0
-; CHECK-NEXT: movfr2gr.d $a1, $fa1
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1
-; CHECK-NEXT: movfr2gr.d $a1, $fa2
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
-; CHECK-NEXT: movfr2gr.d $a1, $fa3
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 3
+; CHECK-NEXT: # kill: def $f3_64 killed $f3_64 def $xr3
+; CHECK-NEXT: # kill: def $f2_64 killed $f2_64 def $xr2
+; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1
+; CHECK-NEXT: xvinsve0.d $xr0, $xr2, 2
+; CHECK-NEXT: xvinsve0.d $xr0, $xr3, 3
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -519,3 +718,22 @@ entry:
store <4 x double> %ins3, ptr %dst
ret void
}
+
+define void @buildvector_v4f64_partial(ptr %dst, double %a0, double %a3) nounwind {
+; CHECK-LABEL: buildvector_v4f64_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
+; CHECK-NEXT: xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x double> undef, double %a0, i32 0
+ %ins1 = insertelement <4 x double> %ins0, double undef, i32 1
+ %ins2 = insertelement <4 x double> %ins1, double undef, i32 2
+ %ins3 = insertelement <4 x double> %ins2, double %a3, i32 3
+ store <4 x double> %ins3, ptr %dst
+ ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
index 9528280..3800712 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
@@ -11,23 +11,22 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 0
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1
; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 2
@@ -35,59 +34,60 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 2
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 2
+; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 3
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 3
+; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 4
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 4
+; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 5
+; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 6
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 6
+; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 7
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 7
+; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 96
@@ -105,45 +105,45 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -96
; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1
+; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 2
+; CHECK-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 3
+; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 96
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f154dd3..221aba3 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -6,15 +6,12 @@
define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2
+; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3
-; CHECK-NEXT: xvori.b $xr0, $xr2, 0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3
; CHECK-NEXT: ret
entry:
%c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index b24f95e..c1d4220 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -87,8 +87,8 @@ define void @insert_8xfloat(ptr %src, ptr %dst, float %in) nounwind {
; CHECK-LABEL: insert_8xfloat:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr1, $a0, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 1
; CHECK-NEXT: xvst $xr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <8 x float>, ptr %src
@@ -101,8 +101,8 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind {
; CHECK-LABEL: insert_4xdouble:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr1, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 1
; CHECK-NEXT: xvst $xr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x double>, ptr %src
diff --git a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
index 7a52531..62ea5cb 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
@@ -196,21 +196,20 @@ define <2 x double> @exp10_v2f64(<2 x double> %x) #0 {
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr0, $vr1, 16
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
index 648c19d..383d63c 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
@@ -571,39 +571,37 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
; LA64-NEXT: addi.d $sp, $sp, -80
; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
-; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sin)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sin)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr0, $vr1, 16
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1
-; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
-; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cos)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cos)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; LA64-NEXT: fmov.d $fa1, $fa0
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr1, $vr0, 16
+; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 80
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index d84e408..4dda012 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -272,6 +272,41 @@ entry:
ret void
}
+define void @buildvector_v16i8_partial(ptr %dst, i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) nounwind {
+; CHECK-LABEL: buildvector_v16i8_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.b $a6, $sp, 15
+; CHECK-NEXT: st.b $a5, $sp, 12
+; CHECK-NEXT: st.b $a4, $sp, 11
+; CHECK-NEXT: st.b $a3, $sp, 8
+; CHECK-NEXT: st.b $a2, $sp, 6
+; CHECK-NEXT: st.b $a1, $sp, 2
+; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <16 x i8> undef, i8 undef, i32 0
+ %ins1 = insertelement <16 x i8> %ins0, i8 undef, i32 1
+ %ins2 = insertelement <16 x i8> %ins1, i8 %a2, i32 2
+ %ins3 = insertelement <16 x i8> %ins2, i8 undef, i32 3
+ %ins4 = insertelement <16 x i8> %ins3, i8 undef, i32 4
+ %ins5 = insertelement <16 x i8> %ins4, i8 undef, i32 5
+ %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6
+ %ins7 = insertelement <16 x i8> %ins6, i8 undef, i32 7
+ %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8
+ %ins9 = insertelement <16 x i8> %ins8, i8 undef, i32 9
+ %ins10 = insertelement <16 x i8> %ins9, i8 undef, i32 10
+ %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
+ %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
+ %ins13 = insertelement <16 x i8> %ins12, i8 undef, i32 13
+ %ins14 = insertelement <16 x i8> %ins13, i8 undef, i32 14
+ %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
+ store <16 x i8> %ins15, ptr %dst
+ ret void
+}
+
define void @buildvector_v8i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; CHECK-LABEL: buildvector_v8i16:
; CHECK: # %bb.0: # %entry
@@ -299,6 +334,31 @@ entry:
ret void
}
+define void @buildvector_v8i16_partial(ptr %dst, i16 %a1, i16 %a3, i16 %a4, i16 %a5) nounwind {
+; CHECK-LABEL: buildvector_v8i16_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.h $a4, $sp, 10
+; CHECK-NEXT: st.h $a3, $sp, 8
+; CHECK-NEXT: st.h $a2, $sp, 6
+; CHECK-NEXT: st.h $a1, $sp, 2
+; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <8 x i16> undef, i16 undef, i32 0
+ %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
+ %ins2 = insertelement <8 x i16> %ins1, i16 undef, i32 2
+ %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
+ %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
+ %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
+ %ins6 = insertelement <8 x i16> %ins5, i16 undef, i32 6
+ %ins7 = insertelement <8 x i16> %ins6, i16 undef, i32 7
+ store <8 x i16> %ins7, ptr %dst
+ ret void
+}
+
define void @buildvector_v4i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
; CHECK-LABEL: buildvector_v4i32:
; CHECK: # %bb.0: # %entry
@@ -317,6 +377,25 @@ entry:
ret void
}
+define void @buildvector_v4i32_partial(ptr %dst, i32 %a0, i32 %a3) nounwind {
+; CHECK-LABEL: buildvector_v4i32_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI23_0)
+; CHECK-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI23_0)
+; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0
+; CHECK-NEXT: vinsgr2vr.w $vr2, $a2, 0
+; CHECK-NEXT: vshuf.w $vr0, $vr2, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %ins1 = insertelement <4 x i32> %ins0, i32 undef, i32 1
+ %ins2 = insertelement <4 x i32> %ins1, i32 undef, i32 2
+ %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
+ store <4 x i32> %ins3, ptr %dst
+ ret void
+}
+
define void @buildvector_v2i64(ptr %dst, i64 %a0, i64 %a1) nounwind {
; CHECK-LABEL: buildvector_v2i64:
; CHECK: # %bb.0: # %entry
@@ -331,17 +410,29 @@ entry:
ret void
}
+define void @buildvector_v2i64_partial(ptr %dst, i64 %a0) nounwind {
+; CHECK-LABEL: buildvector_v2i64_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %ins1 = insertelement <2 x i64> %ins0, i64 undef, i32 1
+ store <2 x i64> %ins1, ptr %dst
+ ret void
+}
+
define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float %a3) nounwind {
; CHECK-LABEL: buildvector_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movfr2gr.s $a1, $fa0
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0
-; CHECK-NEXT: movfr2gr.s $a1, $fa1
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 1
-; CHECK-NEXT: movfr2gr.s $a1, $fa2
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 2
-; CHECK-NEXT: movfr2gr.s $a1, $fa3
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 3
+; CHECK-NEXT: # kill: def $f3 killed $f3 def $vr3
+; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vextrins.w $vr0, $vr1, 16
+; CHECK-NEXT: vextrins.w $vr0, $vr2, 32
+; CHECK-NEXT: vextrins.w $vr0, $vr3, 48
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -353,13 +444,31 @@ entry:
ret void
}
+define void @buildvector_v4f32_partial(ptr %dst, float %a0, float %a3) nounwind {
+; CHECK-LABEL: buildvector_v4f32_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI27_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI27_0)
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x float> undef, float %a0, i32 0
+ %ins1 = insertelement <4 x float> %ins0, float undef, i32 1
+ %ins2 = insertelement <4 x float> %ins1, float undef, i32 2
+ %ins3 = insertelement <4 x float> %ins2, float %a3, i32 3
+ store <4 x float> %ins3, ptr %dst
+ ret void
+}
+
define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind {
; CHECK-LABEL: buildvector_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movfr2gr.d $a1, $fa0
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 0
-; CHECK-NEXT: movfr2gr.d $a1, $fa1
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1
+; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $vr1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -369,6 +478,20 @@ entry:
ret void
}
+define void @buildvector_v2f64_partial(ptr %dst, double %a1) nounwind {
+; CHECK-LABEL: buildvector_v2f64_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <2 x double> undef, double undef, i32 0
+ %ins1 = insertelement <2 x double> %ins0, double %a1, i32 1
+ store <2 x double> %ins1, ptr %dst
+ ret void
+}
+
;; If `isShuffleMaskLegal` returns true, it will lead to an infinite loop.
define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
index aafef07..735dad4 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
@@ -9,45 +9,45 @@ define <4 x float> @powi_v4f32(<4 x float> %va, i32 %b) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -48
; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
-; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr0, $vr1, 16
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; CHECK-NEXT: vreplvei.w $vr0, $vr0, 2
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr1, $vr0, 32
+; CHECK-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 2
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 3
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr1, $vr0, 48
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 48
@@ -67,23 +67,22 @@ define <2 x double> @powi_v2f64(<2 x double> %va, i32 %b) nounwind {
; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 48
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
index 7f23207..c73252b 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
@@ -57,8 +57,8 @@ define void @insert_4xfloat(ptr %src, ptr %dst, float %ins) nounwind {
; CHECK-LABEL: insert_4xfloat:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr1, $a0, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vextrins.w $vr1, $vr0, 16
; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x float>, ptr %src
@@ -71,8 +71,8 @@ define void @insert_2xdouble(ptr %src, ptr %dst, double %ins) nounwind {
; CHECK-LABEL: insert_2xdouble:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr1, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vextrins.d $vr1, $vr0, 16
; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <2 x double>, ptr %src
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index 0ee3012..ad57bbf 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -588,3 +588,18 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
%res = bitcast <2 x i1> %y to i2
ret i2 %res
}
+
+define i4 @vmsk_eq_allzeros_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: vmsk_eq_allzeros_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vseqi.b $vr0, $vr0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vslli.w $vr0, $vr0, 24
+; CHECK-NEXT: vmskltz.w $vr0, $vr0
+; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
+; CHECK-NEXT: ret
+ %1 = icmp eq <4 x i8> %a, zeroinitializer
+ %2 = bitcast <4 x i1> %1 to i4
+ ret i4 %2
+}
diff --git a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
index eb656ad..6e9d26a 100644
--- a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
+++ b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
@@ -24,9 +24,9 @@
; NO-WARNING-NOT: warning: triple-implied ABI conflicts with provided target-abi 'lp64d', using target-abi
;; Check that ILP32-on-LA64 and LP64-on-LA32 combinations are handled properly.
-; RUN: llc --mtriple=loongarch64 --target-abi=ilp32d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch64-linux-gnu --target-abi=ilp32d --mattr=+d < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=LP64D,32ON64
-; RUN: llc --mtriple=loongarch32 --target-abi=lp64d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch32-linux-gnu --target-abi=lp64d --mattr=+d < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=ILP32D,64ON32
; 32ON64: warning: 32-bit ABIs are not supported for 64-bit targets, ignoring and using triple-implied ABI
@@ -49,12 +49,6 @@
; LP64D-LP64F-NOF: warning: both target-abi and the triple-implied ABI are invalid, ignoring and using feature-implied ABI
-;; Check that triple-implied ABI are invalid, use feature-implied ABI
-; RUN: llc --mtriple=loongarch64 --mattr=-f < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=LP64S,LP64D-NONE-NOF
-
-; LP64D-NONE-NOF: warning: the triple-implied ABI is invalid, ignoring and using feature-implied ABI
-
define float @f(float %a) {
; ILP32D-LABEL: f:
; ILP32D: # %bb.0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index b514c493..278cf01 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
@@ -315,6 +316,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index fc730f9..890ea44 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -46,6 +46,7 @@
; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
+; AFTER-PEI-NEXT: isWholeWaveFunction: false
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 5adef14..f84ef8a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index fa40164..cc834d0 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 24565e4..06c580e 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -55,6 +55,7 @@
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -162,6 +163,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -240,6 +242,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -319,6 +322,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index a152713..4271546 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -56,6 +56,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
%gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -105,6 +106,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
%gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -178,6 +180,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define void @function() {
ret void
@@ -233,6 +236,7 @@ define void @function() {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define void @function_nsz() #0 {
ret void
diff --git a/llvm/test/CodeGen/MSP430/llvm.exp10.ll b/llvm/test/CodeGen/MSP430/llvm.exp10.ll
new file mode 100644
index 0000000..7d4cf7e3
--- /dev/null
+++ b/llvm/test/CodeGen/MSP430/llvm.exp10.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=msp430-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=msp430-unknown-linux < %s | FileCheck %s
+; RUN: llc -mtriple=msp430-unknown-linux-gnu < %s | FileCheck %s
+
+define half @exp10_f16(half %x) #0 {
+; CHECK-LABEL: exp10_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: call #__extendhfsf2
+; CHECK-NEXT: call #exp10f
+; CHECK-NEXT: call #__truncsfhf2
+; CHECK-NEXT: ret
+ %r = call half @llvm.exp10.f16(half %x)
+ ret half %r
+}
+
+define <2 x half> @exp10_v2f16(<2 x half> %x) #0 {
+; CHECK-LABEL: exp10_v2f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: push r9
+; CHECK-NEXT: push r10
+; CHECK-NEXT: mov r13, r10
+; CHECK-NEXT: call #__extendhfsf2
+; CHECK-NEXT: call #exp10f
+; CHECK-NEXT: call #__truncsfhf2
+; CHECK-NEXT: mov r12, r9
+; CHECK-NEXT: mov r10, r12
+; CHECK-NEXT: call #__extendhfsf2
+; CHECK-NEXT: call #exp10f
+; CHECK-NEXT: call #__truncsfhf2
+; CHECK-NEXT: mov r12, r13
+; CHECK-NEXT: mov r9, r12
+; CHECK-NEXT: pop r10
+; CHECK-NEXT: pop r9
+; CHECK-NEXT: ret
+ %r = call <2 x half> @llvm.exp10.v2f16(<2 x half> %x)
+ ret <2 x half> %r
+}
+
+define float @exp10_f32(float %x) #0 {
+; CHECK-LABEL: exp10_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: call #exp10f
+; CHECK-NEXT: ret
+ %r = call float @llvm.exp10.f32(float %x)
+ ret float %r
+}
+
+define <2 x float> @exp10_v2f32(<2 x float> %x) #0 {
+; CHECK-LABEL: exp10_v2f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: push r7
+; CHECK-NEXT: push r8
+; CHECK-NEXT: push r9
+; CHECK-NEXT: push r10
+; CHECK-NEXT: mov r15, r10
+; CHECK-NEXT: mov r14, r9
+; CHECK-NEXT: call #exp10f
+; CHECK-NEXT: mov r12, r8
+; CHECK-NEXT: mov r13, r7
+; CHECK-NEXT: mov r9, r12
+; CHECK-NEXT: mov r10, r13
+; CHECK-NEXT: call #exp10f
+; CHECK-NEXT: mov r12, r14
+; CHECK-NEXT: mov r13, r15
+; CHECK-NEXT: mov r8, r12
+; CHECK-NEXT: mov r7, r13
+; CHECK-NEXT: pop r10
+; CHECK-NEXT: pop r9
+; CHECK-NEXT: pop r8
+; CHECK-NEXT: pop r7
+; CHECK-NEXT: ret
+ %r = call <2 x float> @llvm.exp10.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define double @exp10_f64(double %x) #0 {
+; CHECK-LABEL: exp10_f64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: call #exp10
+; CHECK-NEXT: ret
+ %r = call double @llvm.exp10.f64(double %x)
+ ret double %r
+}
+
+define <2 x double> @exp10_v2f64(<2 x double> %x) #0 {
+; CHECK-LABEL: exp10_v2f64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: push r10
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: mov 12(r1), r12
+; CHECK-NEXT: mov 14(r1), r13
+; CHECK-NEXT: mov 16(r1), r14
+; CHECK-NEXT: mov 18(r1), r15
+; CHECK-NEXT: call #exp10
+; CHECK-NEXT: mov r15, 14(r10)
+; CHECK-NEXT: mov r14, 12(r10)
+; CHECK-NEXT: mov r13, 10(r10)
+; CHECK-NEXT: mov r12, 8(r10)
+; CHECK-NEXT: mov 4(r1), r12
+; CHECK-NEXT: mov 6(r1), r13
+; CHECK-NEXT: mov 8(r1), r14
+; CHECK-NEXT: mov 10(r1), r15
+; CHECK-NEXT: call #exp10
+; CHECK-NEXT: mov r15, 6(r10)
+; CHECK-NEXT: mov r14, 4(r10)
+; CHECK-NEXT: mov r13, 2(r10)
+; CHECK-NEXT: mov r12, 0(r10)
+; CHECK-NEXT: pop r10
+; CHECK-NEXT: ret
+ %r = call <2 x double> @llvm.exp10.v2f64(<2 x double> %x)
+ ret <2 x double> %r
+}
+
+define fp128 @exp10_f128(fp128 %x) #0 {
+; CHECK-LABEL: exp10_f128:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: push r10
+; CHECK-NEXT: sub #32, r1
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: mov 50(r1), 14(r1)
+; CHECK-NEXT: mov 48(r1), 12(r1)
+; CHECK-NEXT: mov 46(r1), 10(r1)
+; CHECK-NEXT: mov 44(r1), 8(r1)
+; CHECK-NEXT: mov 42(r1), 6(r1)
+; CHECK-NEXT: mov 40(r1), 4(r1)
+; CHECK-NEXT: mov 38(r1), 2(r1)
+; CHECK-NEXT: mov 36(r1), 0(r1)
+; CHECK-NEXT: mov r1, r12
+; CHECK-NEXT: add #16, r12
+; CHECK-NEXT: call #exp10l
+; CHECK-NEXT: mov 30(r1), 14(r10)
+; CHECK-NEXT: mov 28(r1), 12(r10)
+; CHECK-NEXT: mov 26(r1), 10(r10)
+; CHECK-NEXT: mov 24(r1), 8(r10)
+; CHECK-NEXT: mov 22(r1), 6(r10)
+; CHECK-NEXT: mov 20(r1), 4(r10)
+; CHECK-NEXT: mov 18(r1), 2(r10)
+; CHECK-NEXT: mov 16(r1), 0(r10)
+; CHECK-NEXT: add #32, r1
+; CHECK-NEXT: pop r10
+; CHECK-NEXT: ret
+ %r = call fp128 @llvm.exp10.f128(fp128 %x)
+ ret fp128 %r
+}
+
+define <2 x fp128> @exp10_v2f128(<2 x fp128> %x) #0 {
+; CHECK-LABEL: exp10_v2f128:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: push r10
+; CHECK-NEXT: sub #48, r1
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: mov 82(r1), 14(r1)
+; CHECK-NEXT: mov 80(r1), 12(r1)
+; CHECK-NEXT: mov 78(r1), 10(r1)
+; CHECK-NEXT: mov 76(r1), 8(r1)
+; CHECK-NEXT: mov 74(r1), 6(r1)
+; CHECK-NEXT: mov 72(r1), 4(r1)
+; CHECK-NEXT: mov 70(r1), 2(r1)
+; CHECK-NEXT: mov 68(r1), 0(r1)
+; CHECK-NEXT: mov r1, r12
+; CHECK-NEXT: add #32, r12
+; CHECK-NEXT: call #exp10l
+; CHECK-NEXT: mov 66(r1), 14(r1)
+; CHECK-NEXT: mov 64(r1), 12(r1)
+; CHECK-NEXT: mov 62(r1), 10(r1)
+; CHECK-NEXT: mov 60(r1), 8(r1)
+; CHECK-NEXT: mov 58(r1), 6(r1)
+; CHECK-NEXT: mov 56(r1), 4(r1)
+; CHECK-NEXT: mov 54(r1), 2(r1)
+; CHECK-NEXT: mov 52(r1), 0(r1)
+; CHECK-NEXT: mov r1, r12
+; CHECK-NEXT: add #16, r12
+; CHECK-NEXT: call #exp10l
+; CHECK-NEXT: mov 46(r1), 30(r10)
+; CHECK-NEXT: mov 44(r1), 28(r10)
+; CHECK-NEXT: mov 42(r1), 26(r10)
+; CHECK-NEXT: mov 40(r1), 24(r10)
+; CHECK-NEXT: mov 38(r1), 22(r10)
+; CHECK-NEXT: mov 36(r1), 20(r10)
+; CHECK-NEXT: mov 34(r1), 18(r10)
+; CHECK-NEXT: mov 32(r1), 16(r10)
+; CHECK-NEXT: mov 30(r1), 14(r10)
+; CHECK-NEXT: mov 28(r1), 12(r10)
+; CHECK-NEXT: mov 26(r1), 10(r10)
+; CHECK-NEXT: mov 24(r1), 8(r10)
+; CHECK-NEXT: mov 22(r1), 6(r10)
+; CHECK-NEXT: mov 20(r1), 4(r10)
+; CHECK-NEXT: mov 18(r1), 2(r10)
+; CHECK-NEXT: mov 16(r1), 0(r10)
+; CHECK-NEXT: add #48, r1
+; CHECK-NEXT: pop r10
+; CHECK-NEXT: ret
+ %r = call <2 x fp128> @llvm.exp10.v2f128(<2 x fp128> %x)
+ ret <2 x fp128> %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 23832a9..dd9a472 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -181,32 +181,32 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; ENABLED-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
; ENABLED-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r8, %r4, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r11, %r3, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r12, %r3, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r15, %r2, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r16, %r2, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r17, %r1, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r18, %r1, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r20, %r1, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r8, %r3, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r12, %r2, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r16, %r1, 0, 0x7771U;
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT: add.s32 %r21, %r20, %r19;
-; ENABLED-NEXT: add.s32 %r22, %r21, %r18;
-; ENABLED-NEXT: add.s32 %r23, %r22, %r17;
-; ENABLED-NEXT: add.s32 %r24, %r23, %r16;
-; ENABLED-NEXT: add.s32 %r25, %r24, %r15;
-; ENABLED-NEXT: add.s32 %r26, %r25, %r14;
-; ENABLED-NEXT: add.s32 %r27, %r26, %r13;
-; ENABLED-NEXT: add.s32 %r28, %r27, %r12;
-; ENABLED-NEXT: add.s32 %r29, %r28, %r11;
-; ENABLED-NEXT: add.s32 %r30, %r29, %r10;
-; ENABLED-NEXT: add.s32 %r31, %r30, %r9;
-; ENABLED-NEXT: add.s32 %r32, %r31, %r8;
+; ENABLED-NEXT: and.b32 %r17, %r1, 255;
+; ENABLED-NEXT: and.b32 %r18, %r2, 255;
+; ENABLED-NEXT: and.b32 %r19, %r3, 255;
+; ENABLED-NEXT: and.b32 %r20, %r4, 255;
+; ENABLED-NEXT: add.s32 %r21, %r17, %r16;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r15;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r14;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r18;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r13;
+; ENABLED-NEXT: add.s32 %r26, %r25, %r12;
+; ENABLED-NEXT: add.s32 %r27, %r26, %r11;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r19;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r10;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r9;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r8;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r20;
; ENABLED-NEXT: add.s32 %r33, %r32, %r7;
; ENABLED-NEXT: add.s32 %r34, %r33, %r6;
; ENABLED-NEXT: add.s32 %r35, %r34, %r5;
@@ -332,36 +332,36 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
; ENABLED-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; ENABLED-NEXT: prmt.b32 %r4, %r2, 0, 0x7772U;
; ENABLED-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r6, %r2, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r9, %r1, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r10, %r1, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r6, %r1, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7771U;
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT: ld.v2.b32 {%r11, %r12}, [%rd1+8];
-; ENABLED-NEXT: prmt.b32 %r13, %r12, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r14, %r12, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r15, %r12, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r16, %r12, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r17, %r11, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r18, %r11, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r19, %r11, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r20, %r11, 0, 0x7770U;
-; ENABLED-NEXT: add.s32 %r21, %r10, %r9;
-; ENABLED-NEXT: add.s32 %r22, %r21, %r8;
-; ENABLED-NEXT: add.s32 %r23, %r22, %r7;
-; ENABLED-NEXT: add.s32 %r24, %r23, %r6;
+; ENABLED-NEXT: ld.v2.b32 {%r9, %r10}, [%rd1+8];
+; ENABLED-NEXT: prmt.b32 %r11, %r10, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r12, %r10, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r14, %r9, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r15, %r9, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r16, %r9, 0, 0x7771U;
+; ENABLED-NEXT: and.b32 %r17, %r1, 255;
+; ENABLED-NEXT: and.b32 %r18, %r2, 255;
+; ENABLED-NEXT: and.b32 %r19, %r9, 255;
+; ENABLED-NEXT: and.b32 %r20, %r10, 255;
+; ENABLED-NEXT: add.s32 %r21, %r17, %r8;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r6;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r18;
; ENABLED-NEXT: add.s32 %r25, %r24, %r5;
; ENABLED-NEXT: add.s32 %r26, %r25, %r4;
; ENABLED-NEXT: add.s32 %r27, %r26, %r3;
-; ENABLED-NEXT: add.s32 %r28, %r27, %r20;
-; ENABLED-NEXT: add.s32 %r29, %r28, %r19;
-; ENABLED-NEXT: add.s32 %r30, %r29, %r18;
-; ENABLED-NEXT: add.s32 %r31, %r30, %r17;
-; ENABLED-NEXT: add.s32 %r32, %r31, %r16;
-; ENABLED-NEXT: add.s32 %r33, %r32, %r15;
-; ENABLED-NEXT: add.s32 %r34, %r33, %r14;
-; ENABLED-NEXT: add.s32 %r35, %r34, %r13;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r19;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r16;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r14;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r20;
+; ENABLED-NEXT: add.s32 %r33, %r32, %r13;
+; ENABLED-NEXT: add.s32 %r34, %r33, %r12;
+; ENABLED-NEXT: add.s32 %r35, %r34, %r11;
; ENABLED-NEXT: st.b32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index 80980ef..d61a63c 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -56,23 +56,22 @@ define i16 @test_v4i8(i32 %a) {
; CHECK-LABEL: test_v4i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<8>;
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0];
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U;
-; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
-; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
-; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
+; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r4;
; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4;
; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
-; CHECK-NEXT: cvt.u32.u16 %r6, %rs7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -96,7 +95,7 @@ define i32 @test_v4i8_s32(i32 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0];
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U;
+; CHECK-NEXT: cvt.s32.s8 %r2, %r1;
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
@@ -127,12 +126,12 @@ define i32 @test_v4i8_u32(i32 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_u32_param_0];
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7770U;
-; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7771U;
-; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
-; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7773U;
-; CHECK-NEXT: add.s32 %r6, %r2, %r3;
-; CHECK-NEXT: add.s32 %r7, %r4, %r5;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U;
+; CHECK-NEXT: and.b32 %r5, %r1, 255;
+; CHECK-NEXT: add.s32 %r6, %r5, %r2;
+; CHECK-NEXT: add.s32 %r7, %r3, %r4;
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
@@ -157,26 +156,24 @@ define i16 @test_v8i8(i64 %a) {
; CHECK-LABEL: test_v8i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<16>;
-; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
-; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x8880U;
-; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
-; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xaaa2U;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
-; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0xbbb3U;
-; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x8880U;
-; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x9991U;
-; CHECK-NEXT: cvt.u16.u32 %rs6, %r8;
-; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0xaaa2U;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
-; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0xbbb3U;
-; CHECK-NEXT: cvt.u16.u32 %rs8, %r10;
+; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
+; CHECK-NEXT: cvt.s8.s32 %rs5, %r2;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs6, %r6;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r7;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r8;
; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;
; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
@@ -184,8 +181,8 @@ define i16 @test_v8i8(i64 %a) {
; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs15;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%v = bitcast i64 %a to <8 x i8>
%r0 = extractelement <8 x i8> %v, i64 0
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index f1adc34..9a051b3 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
; CHECK-LABEL: test_select_i1_basic_folding(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .pred %p<13>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
; CHECK-NEXT: setp.eq.b32 %p1, %r1, 0;
-; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT: setp.ne.b32 %p2, %r3, 0;
-; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT: setp.eq.b32 %p4, %r5, 0;
-; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT: setp.ne.b32 %p2, %r2, 0;
+; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0;
+; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0;
+; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
-; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
; CHECK-NEXT: and.pred %p7, %p6, %p4;
-; CHECK-NEXT: and.pred %p8, %p2, %p4;
-; CHECK-NEXT: and.pred %p9, %p3, %p7;
-; CHECK-NEXT: or.pred %p10, %p9, %p8;
-; CHECK-NEXT: xor.pred %p11, %p10, %p3;
-; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: and.pred %p9, %p2, %p4;
+; CHECK-NEXT: and.pred %p10, %p3, %p7;
+; CHECK-NEXT: or.pred %p11, %p10, %p9;
+; CHECK-NEXT: xor.pred %p12, %p11, %p3;
+; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index f2211eb..44d8558 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@
define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: srem_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<126>;
+; CHECK-NEXT: .reg .b64 %rd<127>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT: mov.b64 %rd116, 0;
+; CHECK-NEXT: mov.b64 %rd117, 0;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd116, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd66, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd67, %rd66, 127;
-; CHECK-NEXT: or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd68, 0;
-; CHECK-NEXT: selp.b64 %rd125, 0, %rd4, %p15;
-; CHECK-NEXT: selp.b64 %rd124, 0, %rd3, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB0_5;
+; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0;
+; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB0_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd118, %rd66, 1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd71, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd71, 0;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0;
+; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd66;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd72, %rd4, %r6;
+; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd73, %rd3, %r7;
-; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7;
+; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd75, %rd3, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd123, %rd75, %rd74, %p19;
-; CHECK-NEXT: shl.b64 %rd122, %rd3, %r6;
-; CHECK-NEXT: mov.b64 %rd113, %rd116;
-; CHECK-NEXT: @%p18 bra $L__BB0_4;
+; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6;
+; CHECK-NEXT: mov.b64 %rd114, %rd117;
+; CHECK-NEXT: @%p16 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd118;
-; CHECK-NEXT: shr.u64 %rd78, %rd3, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd79, %rd4, %r10;
-; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10;
+; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd81, %rd4, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd120, %rd81, %rd80, %p20;
-; CHECK-NEXT: shr.u64 %rd121, %rd4, %r9;
+; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT: mov.b64 %rd113, 0;
-; CHECK-NEXT: mov.b64 %rd116, %rd113;
+; CHECK-NEXT: mov.b64 %rd114, 0;
+; CHECK-NEXT: mov.b64 %rd117, %rd114;
; CHECK-NEXT: $L__BB0_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd82, %rd120, 63;
-; CHECK-NEXT: shl.b64 %rd83, %rd121, 1;
-; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT: shl.b64 %rd85, %rd120, 1;
-; CHECK-NEXT: shr.u64 %rd86, %rd123, 63;
-; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT: shr.u64 %rd88, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd89, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT: shl.b64 %rd91, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd122, %rd116, %rd91;
-; CHECK-NEXT: or.b64 %rd123, %rd113, %rd90;
-; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT: shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT: and.b64 %rd116, %rd94, 1;
-; CHECK-NEXT: and.b64 %rd95, %rd94, %rd5;
-; CHECK-NEXT: and.b64 %rd96, %rd94, %rd6;
-; CHECK-NEXT: sub.cc.s64 %rd120, %rd87, %rd95;
-; CHECK-NEXT: subc.cc.s64 %rd121, %rd84, %rd96;
-; CHECK-NEXT: add.cc.s64 %rd118, %rd118, -1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT: or.b64 %rd97, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd97, 0;
-; CHECK-NEXT: @%p21 bra $L__BB0_4;
+; CHECK-NEXT: shr.u64 %rd83, %rd121, 63;
+; CHECK-NEXT: shl.b64 %rd84, %rd122, 1;
+; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT: shl.b64 %rd86, %rd121, 1;
+; CHECK-NEXT: shr.u64 %rd87, %rd124, 63;
+; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT: shr.u64 %rd89, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92;
+; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91;
+; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT: and.b64 %rd117, %rd95, 1;
+; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT: @%p19 bra $L__BB0_4;
; CHECK-NEXT: bra.uni $L__BB0_2;
; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd98, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd99, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT: shl.b64 %rd101, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd124, %rd116, %rd101;
-; CHECK-NEXT: or.b64 %rd125, %rd113, %rd100;
+; CHECK-NEXT: shr.u64 %rd99, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd100, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT: shl.b64 %rd102, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102;
+; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101;
; CHECK-NEXT: $L__BB0_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd102, %rd5, %rd124;
-; CHECK-NEXT: mad.lo.s64 %rd103, %rd5, %rd125, %rd102;
-; CHECK-NEXT: mad.lo.s64 %rd104, %rd6, %rd124, %rd103;
-; CHECK-NEXT: mul.lo.s64 %rd105, %rd5, %rd124;
-; CHECK-NEXT: sub.cc.s64 %rd106, %rd3, %rd105;
-; CHECK-NEXT: subc.cc.s64 %rd107, %rd4, %rd104;
-; CHECK-NEXT: xor.b64 %rd108, %rd106, %rd2;
+; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125;
+; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
+; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
+; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105;
; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT: sub.cc.s64 %rd110, %rd108, %rd2;
-; CHECK-NEXT: subc.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd110, %rd111};
+; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112};
; CHECK-NEXT: ret;
%div = srem i128 %lhs, %rhs
ret i128 %div
@@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<111>;
+; CHECK-NEXT: .reg .b64 %rd<113>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd101, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd101, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd103, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd110, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd109, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB1_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd103, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd108, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd107, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd98, %rd101;
+; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd100, %rd103;
; CHECK-NEXT: @%p14 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd103;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd105, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd106, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd98, 0;
-; CHECK-NEXT: mov.b64 %rd101, %rd98;
+; CHECK-NEXT: mov.b64 %rd100, 0;
+; CHECK-NEXT: mov.b64 %rd103, %rd100;
; CHECK-NEXT: $L__BB1_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd105, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd106, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd105, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd108, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd107, %rd101, %rd80;
-; CHECK-NEXT: or.b64 %rd108, %rd98, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd101, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd103, %rd103, -1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd104, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd107, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd108, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd107, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd110, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82;
+; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd103, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB1_4;
; CHECK-NEXT: bra.uni $L__BB1_2;
; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd109, %rd101, %rd90;
-; CHECK-NEXT: or.b64 %rd110, %rd98, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92;
+; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91;
; CHECK-NEXT: $L__BB1_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd109;
-; CHECK-NEXT: mad.lo.s64 %rd92, %rd3, %rd110, %rd91;
-; CHECK-NEXT: mad.lo.s64 %rd93, %rd4, %rd109, %rd92;
-; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd109;
-; CHECK-NEXT: sub.cc.s64 %rd95, %rd41, %rd94;
-; CHECK-NEXT: subc.cc.s64 %rd96, %rd42, %rd93;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd95, %rd96};
+; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111;
+; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
+; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
+; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98};
; CHECK-NEXT: ret;
%div = urem i128 %lhs, %rhs
ret i128 %div
@@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: sdiv_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<121>;
+; CHECK-NEXT: .reg .b64 %rd<122>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT: mov.b64 %rd111, 0;
+; CHECK-NEXT: mov.b64 %rd112, 0;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd111, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd67, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd68, %rd67, 127;
-; CHECK-NEXT: or.b64 %rd69, %rd68, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd69, 0;
-; CHECK-NEXT: selp.b64 %rd120, 0, %rd2, %p15;
-; CHECK-NEXT: selp.b64 %rd119, 0, %rd1, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB4_5;
+; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0;
+; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB4_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd113, %rd67, 1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd72, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd72, 0;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd67;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd73, %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd74, %rd1, %r7;
-; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7;
+; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd76, %rd1, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd118, %rd76, %rd75, %p19;
-; CHECK-NEXT: shl.b64 %rd117, %rd1, %r6;
-; CHECK-NEXT: mov.b64 %rd108, %rd111;
-; CHECK-NEXT: @%p18 bra $L__BB4_4;
+; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6;
+; CHECK-NEXT: mov.b64 %rd109, %rd112;
+; CHECK-NEXT: @%p16 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd113;
-; CHECK-NEXT: shr.u64 %rd79, %rd1, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd80, %rd2, %r10;
-; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10;
+; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd82, %rd2, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd115, %rd82, %rd81, %p20;
-; CHECK-NEXT: shr.u64 %rd116, %rd2, %r9;
+; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd108, 0;
-; CHECK-NEXT: mov.b64 %rd111, %rd108;
+; CHECK-NEXT: mov.b64 %rd109, 0;
+; CHECK-NEXT: mov.b64 %rd112, %rd109;
; CHECK-NEXT: $L__BB4_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd83, %rd115, 63;
-; CHECK-NEXT: shl.b64 %rd84, %rd116, 1;
-; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT: shl.b64 %rd86, %rd115, 1;
-; CHECK-NEXT: shr.u64 %rd87, %rd118, 63;
-; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT: shr.u64 %rd89, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd90, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT: shl.b64 %rd92, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd117, %rd111, %rd92;
-; CHECK-NEXT: or.b64 %rd118, %rd108, %rd91;
-; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT: and.b64 %rd111, %rd95, 1;
-; CHECK-NEXT: and.b64 %rd96, %rd95, %rd3;
-; CHECK-NEXT: and.b64 %rd97, %rd95, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd115, %rd88, %rd96;
-; CHECK-NEXT: subc.cc.s64 %rd116, %rd85, %rd97;
-; CHECK-NEXT: add.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT: or.b64 %rd98, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd98, 0;
-; CHECK-NEXT: @%p21 bra $L__BB4_4;
+; CHECK-NEXT: shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT: shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT: shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT: shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT: shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT: shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT: shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT: and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT: @%p19 bra $L__BB4_4;
; CHECK-NEXT: bra.uni $L__BB4_2;
; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd99, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd100, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT: shl.b64 %rd102, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd119, %rd111, %rd102;
-; CHECK-NEXT: or.b64 %rd120, %rd108, %rd101;
+; CHECK-NEXT: shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT: shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102;
; CHECK-NEXT: $L__BB4_5: // %udiv-end
-; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5;
; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107};
; CHECK-NEXT: ret;
%div = sdiv i128 %lhs, %rhs
ret i128 %div
@@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<105>;
+; CHECK-NEXT: .reg .b64 %rd<107>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd95, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd97, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB5_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd101, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd92, %rd95;
+; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd94, %rd97;
; CHECK-NEXT: @%p14 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd97;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd100, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT: mov.b64 %rd92, 0;
-; CHECK-NEXT: mov.b64 %rd95, %rd92;
+; CHECK-NEXT: mov.b64 %rd94, 0;
+; CHECK-NEXT: mov.b64 %rd97, %rd94;
; CHECK-NEXT: $L__BB5_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB5_4;
; CHECK-NEXT: bra.uni $L__BB5_2;
; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91;
; CHECK-NEXT: $L__BB5_5: // %udiv-end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd103, %rd104};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
; CHECK-NEXT: ret;
%div = udiv i128 %lhs, %rhs
ret i128 %div
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 9891e33..da99cec 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -2044,7 +2044,7 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
; O0-LABEL: test_srem_v4i8(
; O0: {
; O0-NEXT: .reg .b16 %rs<13>;
-; O0-NEXT: .reg .b32 %r<18>;
+; O0-NEXT: .reg .b32 %r<16>;
; O0-NEXT: .reg .b64 %rd<4>;
; O0-EMPTY:
; O0-NEXT: // %bb.0: // %entry
@@ -2066,27 +2066,25 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
; O0-NEXT: rem.s16 %rs6, %rs5, %rs4;
; O0-NEXT: cvt.u32.u16 %r8, %rs6;
; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U;
-; O0-NEXT: cvt.u16.u32 %rs7, %r10;
-; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U;
-; O0-NEXT: cvt.u16.u32 %rs8, %r11;
+; O0-NEXT: cvt.s8.s32 %rs7, %r2;
+; O0-NEXT: cvt.s8.s32 %rs8, %r1;
; O0-NEXT: rem.s16 %rs9, %rs8, %rs7;
-; O0-NEXT: cvt.u32.u16 %r12, %rs9;
-; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U;
-; O0-NEXT: cvt.u16.u32 %rs10, %r13;
-; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U;
-; O0-NEXT: cvt.u16.u32 %rs11, %r14;
+; O0-NEXT: cvt.u32.u16 %r10, %rs9;
+; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x9991U;
+; O0-NEXT: cvt.u16.u32 %rs10, %r11;
+; O0-NEXT: prmt.b32 %r12, %r1, 0, 0x9991U;
+; O0-NEXT: cvt.u16.u32 %rs11, %r12;
; O0-NEXT: rem.s16 %rs12, %rs11, %rs10;
-; O0-NEXT: cvt.u32.u16 %r15, %rs12;
-; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U;
-; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U;
-; O0-NEXT: st.b32 [%rd3], %r17;
+; O0-NEXT: cvt.u32.u16 %r13, %rs12;
+; O0-NEXT: prmt.b32 %r14, %r10, %r13, 0x3340U;
+; O0-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; O0-NEXT: st.b32 [%rd3], %r15;
; O0-NEXT: ret;
;
; O3-LABEL: test_srem_v4i8(
; O3: {
; O3-NEXT: .reg .b16 %rs<13>;
-; O3-NEXT: .reg .b32 %r<18>;
+; O3-NEXT: .reg .b32 %r<16>;
; O3-NEXT: .reg .b64 %rd<4>;
; O3-EMPTY:
; O3-NEXT: // %bb.0: // %entry
@@ -2108,21 +2106,19 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
; O3-NEXT: rem.s16 %rs6, %rs5, %rs4;
; O3-NEXT: cvt.u32.u16 %r8, %rs6;
; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U;
-; O3-NEXT: cvt.u16.u32 %rs7, %r10;
-; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U;
-; O3-NEXT: cvt.u16.u32 %rs8, %r11;
+; O3-NEXT: cvt.s8.s32 %rs7, %r2;
+; O3-NEXT: cvt.s8.s32 %rs8, %r1;
; O3-NEXT: rem.s16 %rs9, %rs8, %rs7;
-; O3-NEXT: cvt.u32.u16 %r12, %rs9;
-; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U;
-; O3-NEXT: cvt.u16.u32 %rs10, %r13;
-; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U;
-; O3-NEXT: cvt.u16.u32 %rs11, %r14;
+; O3-NEXT: cvt.u32.u16 %r10, %rs9;
+; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x9991U;
+; O3-NEXT: cvt.u16.u32 %rs10, %r11;
+; O3-NEXT: prmt.b32 %r12, %r1, 0, 0x9991U;
+; O3-NEXT: cvt.u16.u32 %rs11, %r12;
; O3-NEXT: rem.s16 %rs12, %rs11, %rs10;
-; O3-NEXT: cvt.u32.u16 %r15, %rs12;
-; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U;
-; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U;
-; O3-NEXT: st.b32 [%rd3], %r17;
+; O3-NEXT: cvt.u32.u16 %r13, %rs12;
+; O3-NEXT: prmt.b32 %r14, %r10, %r13, 0x3340U;
+; O3-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; O3-NEXT: st.b32 [%rd3], %r15;
; O3-NEXT: ret;
entry:
%t57 = load <4 x i8>, ptr %a, align 4
@@ -2142,7 +2138,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; O0-LABEL: test_srem_v3i8(
; O0: {
; O0-NEXT: .reg .b16 %rs<20>;
-; O0-NEXT: .reg .b32 %r<14>;
+; O0-NEXT: .reg .b32 %r<8>;
; O0-NEXT: .reg .b64 %rd<4>;
; O0-EMPTY:
; O0-NEXT: // %bb.0: // %entry
@@ -2161,25 +2157,19 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; O0-NEXT: or.b16 %rs9, %rs8, %rs6;
; O0-NEXT: cvt.u32.u16 %r2, %rs9;
; O0-NEXT: ld.s8 %rs10, [%rd2+2];
-; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U;
-; O0-NEXT: cvt.u16.u32 %rs11, %r3;
-; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U;
-; O0-NEXT: cvt.u16.u32 %rs12, %r4;
+; O0-NEXT: cvt.s16.s8 %rs11, %rs9;
+; O0-NEXT: cvt.s16.s8 %rs12, %rs4;
; O0-NEXT: rem.s16 %rs13, %rs12, %rs11;
-; O0-NEXT: cvt.u32.u16 %r5, %rs13;
-; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U;
-; O0-NEXT: cvt.u16.u32 %rs14, %r6;
-; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U;
-; O0-NEXT: cvt.u16.u32 %rs15, %r7;
+; O0-NEXT: cvt.u32.u16 %r3, %rs13;
+; O0-NEXT: prmt.b32 %r4, %r2, 0, 0x9991U;
+; O0-NEXT: cvt.u16.u32 %rs14, %r4;
+; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x9991U;
+; O0-NEXT: cvt.u16.u32 %rs15, %r5;
; O0-NEXT: rem.s16 %rs16, %rs15, %rs14;
-; O0-NEXT: cvt.u32.u16 %r8, %rs16;
-; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; O0-NEXT: // implicit-def: %r11
-; O0-NEXT: // implicit-def: %r12
-; O0-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U;
-; O0-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U;
+; O0-NEXT: cvt.u32.u16 %r6, %rs16;
+; O0-NEXT: prmt.b32 %r7, %r3, %r6, 0x3340U;
; O0-NEXT: rem.s16 %rs17, %rs5, %rs10;
-; O0-NEXT: cvt.u16.u32 %rs18, %r13;
+; O0-NEXT: cvt.u16.u32 %rs18, %r7;
; O0-NEXT: st.b8 [%rd3], %rs18;
; O0-NEXT: shr.u16 %rs19, %rs18, 8;
; O0-NEXT: st.b8 [%rd3+1], %rs19;
@@ -2189,7 +2179,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; O3-LABEL: test_srem_v3i8(
; O3: {
; O3-NEXT: .reg .b16 %rs<20>;
-; O3-NEXT: .reg .b32 %r<14>;
+; O3-NEXT: .reg .b32 %r<8>;
; O3-NEXT: .reg .b64 %rd<4>;
; O3-EMPTY:
; O3-NEXT: // %bb.0: // %entry
@@ -2208,24 +2198,20 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; O3-NEXT: cvt.u32.u16 %r2, %rs9;
; O3-NEXT: ld.s8 %rs10, [%rd2+2];
; O3-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2];
-; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U;
-; O3-NEXT: cvt.u16.u32 %rs11, %r3;
-; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U;
-; O3-NEXT: cvt.u16.u32 %rs12, %r4;
+; O3-NEXT: cvt.s16.s8 %rs11, %rs9;
+; O3-NEXT: cvt.s16.s8 %rs12, %rs4;
; O3-NEXT: rem.s16 %rs13, %rs12, %rs11;
-; O3-NEXT: cvt.u32.u16 %r5, %rs13;
-; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U;
-; O3-NEXT: cvt.u16.u32 %rs14, %r6;
-; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U;
-; O3-NEXT: cvt.u16.u32 %rs15, %r7;
+; O3-NEXT: cvt.u32.u16 %r3, %rs13;
+; O3-NEXT: prmt.b32 %r4, %r2, 0, 0x9991U;
+; O3-NEXT: cvt.u16.u32 %rs14, %r4;
+; O3-NEXT: prmt.b32 %r5, %r1, 0, 0x9991U;
+; O3-NEXT: cvt.u16.u32 %rs15, %r5;
; O3-NEXT: rem.s16 %rs16, %rs15, %rs14;
-; O3-NEXT: cvt.u32.u16 %r8, %rs16;
-; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; O3-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U;
-; O3-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U;
+; O3-NEXT: cvt.u32.u16 %r6, %rs16;
+; O3-NEXT: prmt.b32 %r7, %r3, %r6, 0x3340U;
; O3-NEXT: rem.s16 %rs17, %rs5, %rs10;
; O3-NEXT: st.b8 [%rd3+2], %rs17;
-; O3-NEXT: cvt.u16.u32 %rs18, %r13;
+; O3-NEXT: cvt.u16.u32 %rs18, %r7;
; O3-NEXT: st.b8 [%rd3], %rs18;
; O3-NEXT: shr.u16 %rs19, %rs18, 8;
; O3-NEXT: st.b8 [%rd3+1], %rs19;
@@ -2340,23 +2326,22 @@ define <4 x float> @test_sitofp_v4i8(<4 x i8> %a) {
; CHECK-LABEL: test_sitofp_v4i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_v4i8_param_0];
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0xbbb3U;
-; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: cvt.rn.f32.s16 %r3, %rs1;
-; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
-; CHECK-NEXT: cvt.rn.f32.s16 %r5, %rs2;
-; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r6;
-; CHECK-NEXT: cvt.rn.f32.s16 %r7, %rs3;
-; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x8880U;
-; CHECK-NEXT: cvt.u16.u32 %rs4, %r8;
-; CHECK-NEXT: cvt.rn.f32.s16 %r9, %rs4;
-; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3};
+; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
+; CHECK-NEXT: cvt.rn.f32.s16 %r2, %rs1;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
+; CHECK-NEXT: cvt.rn.f32.s16 %r4, %rs2;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: cvt.rn.f32.s16 %r6, %rs3;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r7;
+; CHECK-NEXT: cvt.rn.f32.s16 %r8, %rs4;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r2, %r8, %r6, %r4};
; CHECK-NEXT: ret;
%r = sitofp <4 x i8> %a to <4 x float>
ret <4 x float> %r
diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll
index 32e4115..95258f7 100644
--- a/llvm/test/CodeGen/NVPTX/pr126337.ll
+++ b/llvm/test/CodeGen/NVPTX/pr126337.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas -arch=sm_70 -c - %}
; This IR should compile without triggering assertions in LICM
; when the CopyToReg from %0 in the first BB gets eliminated
diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
new file mode 100644
index 0000000..6f4eb22
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define float @test1(float %in) local_unnamed_addr {
+; CHECK-LABEL: test1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define half @test2(half %in) local_unnamed_addr {
+; CHECK-LABEL: test2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test2_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+declare float @llvm.tanh.f32(float)
+declare half @llvm.tanh.f16(half)
+
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
new file mode 100644
index 0000000..8f50206
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
@@ -0,0 +1,14 @@
+# Check all variants of instructions supported by PTX78 on SM90
+# RUN: %python %s --ptx=78 --gpu-arch=90 --aa > %t-ptx78-sm_90.ll
+# RUN: FileCheck %t-ptx78-sm_90.ll < %t-ptx78-sm_90.ll \
+# RUN: --check-prefixes=PTX78STMATRIX-DAG
+# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \
+# RUN: | FileCheck %t-ptx78-sm_90.ll
+# RUN: %if ptxas-12.7 %{ \
+# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \
+# RUN: | %ptxas-verify -arch=sm_90 \
+# RUN: %}
+
+import wmma
+
+wmma.main()
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
index 6ad0a2a..5c14a54 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
@@ -1,9 +1,7 @@
# Check all variants of instructions supported by PTX86 on SM100a
# RUN: %python %s --ptx=86 --gpu-arch=100 --aa > %t-ptx86-sm_100a.ll
# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
-# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
+# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
# RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \
# RUN: | FileCheck %t-ptx86-sm_100a.ll
# RUN: %if ptxas-12.7 %{ \
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
index 7d99534..a77f9ad 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
@@ -1,9 +1,7 @@
# Check all variants of instructions supported by PTX86 on SM101a
# RUN: %python %s --ptx=86 --gpu-arch=101 --aa > %t-ptx86-sm_101a.ll
# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
-# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
+# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
# RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \
# RUN: | FileCheck %t-ptx86-sm_101a.ll
# RUN: %if ptxas-12.7 %{ \
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
index 7bddf0b..8126e64 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
@@ -1,9 +1,7 @@
# Check all variants of instructions supported by PTX86 on SM120a
# RUN: %python %s --ptx=86 --gpu-arch=120 --aa > %t-ptx86-sm_120a.ll
# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
-# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
+# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
# RUN: | FileCheck %t-ptx86-sm_120a.ll
# RUN: %if ptxas-12.7 %{ \
diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 2ee4896..2eb3c3d 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -10,6 +10,7 @@ import argparse
from itertools import product
from string import Template
+
class MMAType:
def __init__(self, ptx_type):
self.ptx_type = ptx_type
@@ -176,6 +177,13 @@ class MMAFrag:
"m8n16:x1:b8x16.b4x16_p64": 1,
"m8n16:x2:b8x16.b4x16_p64": 2,
"m8n16:x4:b8x16.b4x16_p64": 4,
+ # stmatrix
+ "m8n8:x1:b16": 1,
+ "m8n8:x2:b16": 2,
+ "m8n8:x4:b16": 4,
+ "m16n8:x1:b8": 1,
+ "m16n8:x2:b8": 2,
+ "m16n8:x4:b8": 4,
}.get(
"%s:%s:%s" % (geom, frag, ptx_elt_type),
{
@@ -241,6 +249,13 @@ def make_ldmatrix_ops(geoms, frags, types):
]
+def make_stmatrix_ops(geoms, frags, types):
+ return [
+ MMAFrag(geom, frag, ptx_type)
+ for (geom, frag, ptx_type) in product(geoms, frags, types)
+ ]
+
+
def get_wmma_ops():
return (
make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], [])
@@ -315,6 +330,12 @@ def get_ldmatrix_ops():
)
+def get_stmatrix_ops():
+ return make_stmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"]) + make_stmatrix_ops(
+ ["m16n8"], ["x1", "x2", "x4"], ["b8"]
+ )
+
+
def is_wmma_geom_supported(geom):
# geometries for FP and ints.
if geom in ["m8n32k16", "m32n8k16"]:
@@ -360,6 +381,14 @@ def is_ldmatrix_geom_supported(geom):
assert False # Unexpected geometry.
+def is_stmatrix_geom_supported(geom):
+ if geom in ["m8n8"]:
+ return ptx_version >= 78 and gpu_arch >= 90
+ elif geom in ["m16n8"]:
+ return ptx_version >= 86 and gpu_arch >= 100 and aa
+ assert False # Unexpected geometry.
+
+
def is_ldmatrix_trans_supported(geom, trans):
if geom in ["m8n8"]:
return True
@@ -369,6 +398,15 @@ def is_ldmatrix_trans_supported(geom, trans):
return trans == ""
assert False # Unexpected geometry.
+
+def is_stmatrix_trans_supported(geom, trans):
+ if geom in ["m8n8"]:
+ return True
+ elif geom in ["m16n8"]:
+ return trans == ".trans"
+ assert False # Unexpected geometry.
+
+
def is_type_supported(ptx_type):
if ptx_type in ["s8", "u8", "s32"]:
return ptx_version >= 63 and gpu_arch >= 72
@@ -463,6 +501,16 @@ def is_ldmatrix_variant_supported(frag, trans):
return frag.frag in ["x1", "x2", "x4"]
+def is_stmatrix_variant_supported(frag, trans):
+ if not (
+ is_type_supported(frag.mma_type.ptx_type)
+ and is_stmatrix_geom_supported(frag.geom)
+ and is_stmatrix_trans_supported(frag.geom, trans)
+ ):
+ return False
+ return frag.frag in ["x1", "x2", "x4"]
+
+
def make_wmma_slice_ty(frag):
return [frag.mma_type.llvm_type] * frag.nregs
@@ -717,6 +765,65 @@ define ${ret_ty} @test_${function}_o(i8 ${as}* %src) {
return generated_items
+def gen_stmatrix_tests():
+ stmatrix_template = """
+declare void @${intrinsic}(i8 ${as}* %dst, ${args});
+
+; CHECK-LABEL: .func {{.*}}test_${function}(
+define void @test_${function}(i8 ${as}* %dst, ${args}) {
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}]
+; CHECK: {${check_args}}
+ call void @${intrinsic}(i8 ${as}* %dst, ${args});
+ ret void
+}
+
+; CHECK-LABEL: .func{{.*}}test_${function}_o(
+define void @test_${function}_o(i8 ${as}* %dst, ${args}) {
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}+128],
+; CHECK: {${check_args}}
+ %dst1 = getelementptr i8, i8 ${as}* %dst, i32 128;
+ call void @${intrinsic}(i8 ${as}* %dst1, ${args});
+ ret void
+}
+"""
+ intrinsic_template = (
+ "llvm.nvvm.stmatrix.sync.aligned.${geom}.${frag}${trans}.${itype}.${pspace}"
+ )
+ instruction_template = (
+ "stmatrix.sync.aligned.${geom}.${frag}${trans}${space}.${itype}"
+ )
+ generated_items = []
+
+ for frag, space, trans in product(
+ get_stmatrix_ops(),
+ ["", ".shared"],
+ ["", ".trans"],
+ ):
+ if not is_stmatrix_variant_supported(frag, trans):
+ continue
+
+ params = {
+ "frag": frag.frag,
+ "space": space,
+ "trans": trans,
+ "itype": frag.mma_type.ptx_type,
+ "pspace": get_pspace(space),
+ "as": "addrspace(%d)" % get_aspace(space),
+ "geom": frag.geom,
+ }
+
+ test_params = params
+ test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
+ test_params["function"] = test_params["intrinsic"].replace(".", "_")
+ test_params["instruction"] = Template(instruction_template).substitute(params)
+ test_params["args"] = make_wmma_slice_args(frag)
+ test_params["check_args"] = check_pattern(frag)
+
+ print(Template(stmatrix_template).substitute(test_params))
+ generated_items.append((test_params["intrinsic"], test_params["instruction"]))
+
+ return generated_items
+
def mma_signature(op):
if op.a.mma_type.ptx_type == "f16":
# FP16 ops identified by accumulator & result type.
@@ -893,6 +1000,7 @@ def gen_check_unsupported_ops(items):
; NOALTFLOAT-NOT: .{{bf16|tf32}}
; NODOUBLE-NOT: .f64
; NOLDMATRIX-NOT: ldmatrix.sync.aligned
+; NOSTMATRIX-NOT: stmatrix.sync.aligned
; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
@@ -994,6 +1102,26 @@ def gen_check_unsupported_ops(items):
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b6x16_p32
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b4x16_p64
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.shared.b16
+
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.shared.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.shared.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.shared.b8
+
; PTX71MMA-DAG: mma.m8n8k4.row.col.f64
; PTX71MMA-DAG: mma.m16n8k4.row.col.tf32
; PTX71MMA-DAG: mma.m16n8k8.row.col.tf32
@@ -1039,6 +1167,7 @@ def gen_tests():
items = gen_wmma_load_tests()
items += gen_wmma_store_tests()
items += gen_ldmatrix_tests()
+ items += gen_stmatrix_tests()
items += gen_wmma_mma_tests()
items += gen_mma_tests()
gen_check_unsupported_ops(items)
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 821cfd0..b540948 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -764,8 +764,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
;
; CHECK-PWR7-LABEL: sub_absv_8_ext:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: stdu r1, -448(r1)
-; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 448
+; CHECK-PWR7-NEXT: stdu r1, -512(r1)
+; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT: .cfi_offset r14, -144
+; CHECK-PWR7-NEXT: .cfi_offset r15, -136
+; CHECK-PWR7-NEXT: .cfi_offset r16, -128
+; CHECK-PWR7-NEXT: .cfi_offset r17, -120
+; CHECK-PWR7-NEXT: .cfi_offset r18, -112
; CHECK-PWR7-NEXT: .cfi_offset r19, -104
; CHECK-PWR7-NEXT: .cfi_offset r20, -96
; CHECK-PWR7-NEXT: .cfi_offset r21, -88
@@ -778,258 +783,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: .cfi_offset r28, -32
; CHECK-PWR7-NEXT: .cfi_offset r29, -24
; CHECK-PWR7-NEXT: .cfi_offset r30, -16
-; CHECK-PWR7-NEXT: addi r3, r1, 304
-; CHECK-PWR7-NEXT: std r19, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r20, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r21, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r22, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r23, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r24, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r25, 392(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r26, 400(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r27, 408(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r28, 416(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r29, 424(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r30, 432(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: .cfi_offset r31, -8
+; CHECK-PWR7-NEXT: .cfi_offset r2, -152
; CHECK-PWR7-NEXT: addi r3, r1, 320
-; CHECK-PWR7-NEXT: lbz r7, 304(r1)
-; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r8, 320(r1)
-; CHECK-PWR7-NEXT: lbz r9, 305(r1)
-; CHECK-PWR7-NEXT: lbz r10, 321(r1)
-; CHECK-PWR7-NEXT: lbz r26, 325(r1)
-; CHECK-PWR7-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT: clrlwi r8, r8, 24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT: lbz r11, 306(r1)
-; CHECK-PWR7-NEXT: lbz r12, 322(r1)
-; CHECK-PWR7-NEXT: lbz r23, 314(r1)
-; CHECK-PWR7-NEXT: clrlwi r22, r26, 24
-; CHECK-PWR7-NEXT: lbz r26, 330(r1)
-; CHECK-PWR7-NEXT: sub r8, r7, r8
-; CHECK-PWR7-NEXT: lbz r7, 315(r1)
-; CHECK-PWR7-NEXT: sub r20, r9, r10
-; CHECK-PWR7-NEXT: lbz r9, 331(r1)
-; CHECK-PWR7-NEXT: lbz r0, 307(r1)
-; CHECK-PWR7-NEXT: lbz r30, 323(r1)
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r12, r12, 24
-; CHECK-PWR7-NEXT: clrlwi r23, r23, 24
-; CHECK-PWR7-NEXT: clrlwi r21, r26, 24
-; CHECK-PWR7-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r0, r0, 24
-; CHECK-PWR7-NEXT: clrlwi r30, r30, 24
-; CHECK-PWR7-NEXT: lbz r29, 308(r1)
-; CHECK-PWR7-NEXT: lbz r28, 324(r1)
-; CHECK-PWR7-NEXT: lbz r27, 309(r1)
-; CHECK-PWR7-NEXT: lbz r25, 310(r1)
-; CHECK-PWR7-NEXT: lbz r24, 326(r1)
-; CHECK-PWR7-NEXT: sub r19, r11, r12
-; CHECK-PWR7-NEXT: sub r11, r23, r21
-; CHECK-PWR7-NEXT: sub r9, r7, r9
-; CHECK-PWR7-NEXT: sub r26, r0, r30
-; CHECK-PWR7-NEXT: srawi r12, r11, 31
-; CHECK-PWR7-NEXT: srawi r0, r9, 31
-; CHECK-PWR7-NEXT: lbz r3, 312(r1)
-; CHECK-PWR7-NEXT: clrlwi r29, r29, 24
-; CHECK-PWR7-NEXT: clrlwi r28, r28, 24
-; CHECK-PWR7-NEXT: clrlwi r27, r27, 24
-; CHECK-PWR7-NEXT: clrlwi r25, r25, 24
-; CHECK-PWR7-NEXT: clrlwi r24, r24, 24
-; CHECK-PWR7-NEXT: xor r11, r11, r12
-; CHECK-PWR7-NEXT: xor r9, r9, r0
-; CHECK-PWR7-NEXT: sub r28, r29, r28
-; CHECK-PWR7-NEXT: sub r30, r27, r22
-; CHECK-PWR7-NEXT: sub r29, r25, r24
-; CHECK-PWR7-NEXT: sub r27, r11, r12
-; CHECK-PWR7-NEXT: sub r24, r9, r0
-; CHECK-PWR7-NEXT: lbz r9, 316(r1)
-; CHECK-PWR7-NEXT: lbz r11, 332(r1)
-; CHECK-PWR7-NEXT: lbz r4, 328(r1)
-; CHECK-PWR7-NEXT: lbz r5, 311(r1)
-; CHECK-PWR7-NEXT: lbz r6, 327(r1)
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r3, r3, 24
-; CHECK-PWR7-NEXT: clrlwi r4, r4, 24
-; CHECK-PWR7-NEXT: clrlwi r5, r5, 24
-; CHECK-PWR7-NEXT: clrlwi r6, r6, 24
-; CHECK-PWR7-NEXT: sub r3, r3, r4
+; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: lbz r3, 320(r1)
+; CHECK-PWR7-NEXT: addi r4, r1, 336
+; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT: lbz r15, 334(r1)
+; CHECK-PWR7-NEXT: lbz r14, 350(r1)
+; CHECK-PWR7-NEXT: lbz r31, 335(r1)
+; CHECK-PWR7-NEXT: lbz r2, 351(r1)
+; CHECK-PWR7-NEXT: sub r15, r15, r14
+; CHECK-PWR7-NEXT: sub r14, r31, r2
+; CHECK-PWR7-NEXT: srawi r2, r14, 31
+; CHECK-PWR7-NEXT: xor r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r3, 333(r1)
+; CHECK-PWR7-NEXT: lbz r19, 331(r1)
+; CHECK-PWR7-NEXT: lbz r18, 347(r1)
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: lbz r17, 332(r1)
+; CHECK-PWR7-NEXT: lbz r16, 348(r1)
+; CHECK-PWR7-NEXT: sub r17, r17, r16
+; CHECK-PWR7-NEXT: lbz r23, 329(r1)
+; CHECK-PWR7-NEXT: sub r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r2, 349(r1)
+; CHECK-PWR7-NEXT: lbz r22, 345(r1)
+; CHECK-PWR7-NEXT: lbz r4, 336(r1)
+; CHECK-PWR7-NEXT: lbz r5, 321(r1)
+; CHECK-PWR7-NEXT: lbz r6, 337(r1)
+; CHECK-PWR7-NEXT: lbz r7, 322(r1)
+; CHECK-PWR7-NEXT: lbz r8, 338(r1)
+; CHECK-PWR7-NEXT: lbz r9, 323(r1)
+; CHECK-PWR7-NEXT: lbz r10, 339(r1)
+; CHECK-PWR7-NEXT: lbz r11, 324(r1)
+; CHECK-PWR7-NEXT: lbz r12, 340(r1)
+; CHECK-PWR7-NEXT: lbz r0, 325(r1)
+; CHECK-PWR7-NEXT: lbz r30, 341(r1)
+; CHECK-PWR7-NEXT: lbz r29, 326(r1)
+; CHECK-PWR7-NEXT: lbz r28, 342(r1)
+; CHECK-PWR7-NEXT: lbz r27, 327(r1)
+; CHECK-PWR7-NEXT: lbz r26, 343(r1)
+; CHECK-PWR7-NEXT: sub r3, r3, r2
+; CHECK-PWR7-NEXT: lbz r25, 328(r1)
+; CHECK-PWR7-NEXT: lbz r24, 344(r1)
+; CHECK-PWR7-NEXT: lbz r21, 330(r1)
+; CHECK-PWR7-NEXT: lbz r20, 346(r1)
; CHECK-PWR7-NEXT: sub r5, r5, r6
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: srawi r4, r3, 31
+; CHECK-PWR7-NEXT: srawi r18, r3, 31
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r29, r29, r28
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r25, r25, r24
+; CHECK-PWR7-NEXT: srawi r31, r15, 31
+; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: xor r3, r3, r18
; CHECK-PWR7-NEXT: srawi r6, r5, 31
-; CHECK-PWR7-NEXT: xor r3, r3, r4
-; CHECK-PWR7-NEXT: sldi r27, r27, 56
-; CHECK-PWR7-NEXT: xor r5, r5, r6
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: sub r3, r3, r4
-; CHECK-PWR7-NEXT: sldi r24, r24, 56
+; CHECK-PWR7-NEXT: srawi r8, r7, 31
+; CHECK-PWR7-NEXT: srawi r10, r9, 31
+; CHECK-PWR7-NEXT: srawi r12, r11, 31
+; CHECK-PWR7-NEXT: srawi r30, r0, 31
+; CHECK-PWR7-NEXT: sub r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r18, r19, 31
+; CHECK-PWR7-NEXT: srawi r28, r29, 31
+; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r27, 208(r1)
-; CHECK-PWR7-NEXT: sub r4, r5, r6
-; CHECK-PWR7-NEXT: std r27, 216(r1)
-; CHECK-PWR7-NEXT: srawi r27, r29, 31
-; CHECK-PWR7-NEXT: lbz r10, 313(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: std r24, 224(r1)
-; CHECK-PWR7-NEXT: lbz r22, 329(r1)
-; CHECK-PWR7-NEXT: std r24, 232(r1)
-; CHECK-PWR7-NEXT: srawi r24, r30, 31
-; CHECK-PWR7-NEXT: ld r21, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r23, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 317(r1)
-; CHECK-PWR7-NEXT: lbz r11, 333(r1)
-; CHECK-PWR7-NEXT: xor r29, r29, r27
-; CHECK-PWR7-NEXT: std r3, 176(r1)
-; CHECK-PWR7-NEXT: std r3, 184(r1)
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: sldi r23, r23, 56
-; CHECK-PWR7-NEXT: xor r30, r30, r24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: sub r4, r30, r24
-; CHECK-PWR7-NEXT: ld r30, 432(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: std r3, 160(r1)
-; CHECK-PWR7-NEXT: std r3, 168(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: sub r3, r29, r27
-; CHECK-PWR7-NEXT: std r23, 240(r1)
-; CHECK-PWR7-NEXT: ld r29, 424(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r23, 248(r1)
-; CHECK-PWR7-NEXT: ld r27, 408(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r23, r28, 31
+; CHECK-PWR7-NEXT: srawi r26, r27, 31
+; CHECK-PWR7-NEXT: srawi r24, r25, 31
+; CHECK-PWR7-NEXT: xor r19, r19, r18
+; CHECK-PWR7-NEXT: xor r15, r15, r31
+; CHECK-PWR7-NEXT: xor r5, r5, r6
+; CHECK-PWR7-NEXT: std r3, 272(r1)
+; CHECK-PWR7-NEXT: std r3, 280(r1)
+; CHECK-PWR7-NEXT: srawi r3, r17, 31
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: xor r7, r7, r8
+; CHECK-PWR7-NEXT: sub r15, r15, r31
+; CHECK-PWR7-NEXT: xor r17, r17, r3
+; CHECK-PWR7-NEXT: xor r9, r9, r10
+; CHECK-PWR7-NEXT: xor r11, r11, r12
+; CHECK-PWR7-NEXT: xor r0, r0, r30
+; CHECK-PWR7-NEXT: xor r29, r29, r28
+; CHECK-PWR7-NEXT: xor r27, r27, r26
+; CHECK-PWR7-NEXT: sub r3, r17, r3
+; CHECK-PWR7-NEXT: xor r25, r25, r24
+; CHECK-PWR7-NEXT: sub r25, r25, r24
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r29, r29, r28
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: xor r28, r28, r23
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: std r3, 144(r1)
-; CHECK-PWR7-NEXT: ld r24, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: std r3, 152(r1)
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: sub r25, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 318(r1)
-; CHECK-PWR7-NEXT: lbz r11, 334(r1)
-; CHECK-PWR7-NEXT: std r3, 128(r1)
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: sldi r14, r14, 56
+; CHECK-PWR7-NEXT: sldi r15, r15, 56
+; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r3, 256(r1)
+; CHECK-PWR7-NEXT: std r3, 264(r1)
+; CHECK-PWR7-NEXT: sldi r3, r19, 56
; CHECK-PWR7-NEXT: sldi r25, r25, 56
-; CHECK-PWR7-NEXT: std r3, 136(r1)
-; CHECK-PWR7-NEXT: sub r3, r28, r23
+; CHECK-PWR7-NEXT: sldi r27, r27, 56
+; CHECK-PWR7-NEXT: std r3, 240(r1)
+; CHECK-PWR7-NEXT: std r3, 248(r1)
+; CHECK-PWR7-NEXT: sub r3, r23, r22
+; CHECK-PWR7-NEXT: srawi r23, r3, 31
+; CHECK-PWR7-NEXT: sub r22, r21, r20
+; CHECK-PWR7-NEXT: srawi r21, r22, 31
+; CHECK-PWR7-NEXT: sldi r29, r29, 56
+; CHECK-PWR7-NEXT: sldi r0, r0, 56
+; CHECK-PWR7-NEXT: sldi r11, r11, 56
+; CHECK-PWR7-NEXT: xor r3, r3, r23
+; CHECK-PWR7-NEXT: xor r22, r22, r21
+; CHECK-PWR7-NEXT: sldi r9, r9, 56
+; CHECK-PWR7-NEXT: sldi r7, r7, 56
+; CHECK-PWR7-NEXT: sldi r5, r5, 56
+; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r3, r3, r23
+; CHECK-PWR7-NEXT: sub r22, r22, r21
+; CHECK-PWR7-NEXT: std r14, 304(r1)
+; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: std r3, 112(r1)
-; CHECK-PWR7-NEXT: ld r28, 416(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT: std r25, 256(r1)
-; CHECK-PWR7-NEXT: std r25, 264(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: srawi r25, r26, 31
-; CHECK-PWR7-NEXT: xor r26, r26, r25
-; CHECK-PWR7-NEXT: ld r23, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r3, 120(r1)
-; CHECK-PWR7-NEXT: sub r4, r26, r25
-; CHECK-PWR7-NEXT: clrlwi r22, r22, 24
-; CHECK-PWR7-NEXT: srawi r7, r8, 31
-; CHECK-PWR7-NEXT: sub r10, r10, r22
-; CHECK-PWR7-NEXT: ld r26, 400(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: srawi r22, r10, 31
-; CHECK-PWR7-NEXT: xor r8, r8, r7
-; CHECK-PWR7-NEXT: xor r10, r10, r22
-; CHECK-PWR7-NEXT: sub r10, r10, r22
-; CHECK-PWR7-NEXT: ld r25, 392(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r12, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 319(r1)
-; CHECK-PWR7-NEXT: lbz r11, 335(r1)
-; CHECK-PWR7-NEXT: std r3, 96(r1)
-; CHECK-PWR7-NEXT: sldi r12, r12, 56
-; CHECK-PWR7-NEXT: std r3, 104(r1)
-; CHECK-PWR7-NEXT: ld r22, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sldi r10, r10, 56
-; CHECK-PWR7-NEXT: std r10, 192(r1)
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: std r12, 272(r1)
-; CHECK-PWR7-NEXT: std r12, 280(r1)
-; CHECK-PWR7-NEXT: srawi r12, r19, 31
-; CHECK-PWR7-NEXT: xor r0, r19, r12
-; CHECK-PWR7-NEXT: ld r19, 344(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r3, r0, r12
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r10, 200(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r11
+; CHECK-PWR7-NEXT: sldi r22, r22, 56
+; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r14, 312(r1)
+; CHECK-PWR7-NEXT: std r15, 288(r1)
+; CHECK-PWR7-NEXT: std r3, 208(r1)
+; CHECK-PWR7-NEXT: std r3, 216(r1)
+; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT: std r15, 296(r1)
+; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r22, 224(r1)
+; CHECK-PWR7-NEXT: std r22, 232(r1)
+; CHECK-PWR7-NEXT: sub r4, r3, r4
+; CHECK-PWR7-NEXT: std r25, 192(r1)
+; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r3, r4, 31
+; CHECK-PWR7-NEXT: std r25, 200(r1)
+; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r27, 176(r1)
+; CHECK-PWR7-NEXT: std r27, 184(r1)
+; CHECK-PWR7-NEXT: xor r4, r4, r3
+; CHECK-PWR7-NEXT: std r29, 160(r1)
+; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r29, 168(r1)
+; CHECK-PWR7-NEXT: std r0, 144(r1)
+; CHECK-PWR7-NEXT: sub r3, r4, r3
+; CHECK-PWR7-NEXT: std r0, 152(r1)
+; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: std r3, 80(r1)
-; CHECK-PWR7-NEXT: std r3, 88(r1)
-; CHECK-PWR7-NEXT: sldi r9, r9, 56
-; CHECK-PWR7-NEXT: std r9, 288(r1)
-; CHECK-PWR7-NEXT: std r9, 296(r1)
-; CHECK-PWR7-NEXT: srawi r9, r20, 31
-; CHECK-PWR7-NEXT: xor r11, r20, r9
-; CHECK-PWR7-NEXT: ld r20, 352(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r4, r11, r9
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
+; CHECK-PWR7-NEXT: std r11, 128(r1)
+; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r11, 136(r1)
+; CHECK-PWR7-NEXT: std r9, 112(r1)
; CHECK-PWR7-NEXT: std r3, 64(r1)
; CHECK-PWR7-NEXT: std r3, 72(r1)
-; CHECK-PWR7-NEXT: sub r3, r8, r7
-; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: std r3, 48(r1)
-; CHECK-PWR7-NEXT: std r3, 56(r1)
-; CHECK-PWR7-NEXT: addi r3, r1, 288
+; CHECK-PWR7-NEXT: addi r3, r1, 304
+; CHECK-PWR7-NEXT: std r9, 120(r1)
+; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r7, 96(r1)
+; CHECK-PWR7-NEXT: std r7, 104(r1)
+; CHECK-PWR7-NEXT: std r5, 80(r1)
+; CHECK-PWR7-NEXT: std r5, 88(r1)
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: addi r3, r1, 288
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 256
+; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 240
+; CHECK-PWR7-NEXT: addi r3, r1, 256
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 224
+; CHECK-PWR7-NEXT: addi r3, r1, 240
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 208
+; CHECK-PWR7-NEXT: addi r3, r1, 224
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 192
+; CHECK-PWR7-NEXT: addi r3, r1, 208
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 176
+; CHECK-PWR7-NEXT: addi r3, r1, 192
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 160
+; CHECK-PWR7-NEXT: addi r3, r1, 176
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 144
+; CHECK-PWR7-NEXT: addi r3, r1, 160
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 128
+; CHECK-PWR7-NEXT: addi r3, r1, 144
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 112
+; CHECK-PWR7-NEXT: addi r3, r1, 128
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT: addi r3, r1, 112
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 80
+; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 64
+; CHECK-PWR7-NEXT: addi r3, r1, 80
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 48
+; CHECK-PWR7-NEXT: addi r3, r1, 64
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2
; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT: addi r1, r1, 448
+; CHECK-PWR7-NEXT: addi r1, r1, 512
; CHECK-PWR7-NEXT: blr
entry:
%vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
index 4b999b8..6864afe 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -66,7 +66,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: srli a2, a2, 32
; RV64IM-NEXT: mul a1, a2, a1
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 2
@@ -79,7 +79,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: zext.w a2, a0
; RV64IMZB-NEXT: mul a1, a2, a1
; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 2
@@ -250,7 +250,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64-NEXT: zext.b a2, a0
; RV64-NEXT: mul a1, a2, a1
; RV64-NEXT: srli a1, a1, 8
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: zext.b a0, a0
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: add a0, a0, a1
@@ -414,8 +414,7 @@ define i32 @sdiv_constant_srai(i32 %a) nounwind {
; RV64-NEXT: addi a1, a1, 1639
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srai a0, a0, 32
-; RV64-NEXT: sraiw a0, a0, 1
+; RV64-NEXT: srai a0, a0, 33
; RV64-NEXT: srliw a1, a0, 31
; RV64-NEXT: addw a0, a0, a1
; RV64-NEXT: ret
@@ -656,8 +655,6 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV32IM-NEXT: srai a0, a0, 24
; RV32IM-NEXT: mul a0, a0, a1
; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: srai a0, a0, 24
-; RV32IM-NEXT: slli a0, a0, 24
; RV32IM-NEXT: srai a0, a0, 25
; RV32IM-NEXT: zext.b a1, a0
; RV32IM-NEXT: srli a1, a1, 7
@@ -670,9 +667,7 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV32IMZB-NEXT: sext.b a0, a0
; RV32IMZB-NEXT: mul a0, a0, a1
; RV32IMZB-NEXT: sext.h a0, a0
-; RV32IMZB-NEXT: srai a0, a0, 8
-; RV32IMZB-NEXT: sext.b a0, a0
-; RV32IMZB-NEXT: srai a0, a0, 1
+; RV32IMZB-NEXT: srai a0, a0, 9
; RV32IMZB-NEXT: zext.b a1, a0
; RV32IMZB-NEXT: srli a1, a1, 7
; RV32IMZB-NEXT: add a0, a0, a1
@@ -685,8 +680,6 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a0, a0, 56
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: slli a0, a0, 48
-; RV64IM-NEXT: srai a0, a0, 56
-; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srai a0, a0, 57
; RV64IM-NEXT: zext.b a1, a0
; RV64IM-NEXT: srli a1, a1, 7
@@ -699,9 +692,7 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: sext.b a0, a0
; RV64IMZB-NEXT: mul a0, a0, a1
; RV64IMZB-NEXT: sext.h a0, a0
-; RV64IMZB-NEXT: srai a0, a0, 8
-; RV64IMZB-NEXT: sext.b a0, a0
-; RV64IMZB-NEXT: srai a0, a0, 1
+; RV64IMZB-NEXT: srai a0, a0, 9
; RV64IMZB-NEXT: zext.b a1, a0
; RV64IMZB-NEXT: srli a1, a1, 7
; RV64IMZB-NEXT: add a0, a0, a1
@@ -816,7 +807,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IM-NEXT: mul a1, a2, a1
; RV64IM-NEXT: slli a1, a1, 48
; RV64IM-NEXT: srai a1, a1, 56
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 56
; RV64IM-NEXT: srai a0, a1, 58
; RV64IM-NEXT: zext.b a1, a0
@@ -906,8 +897,6 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
; RV32IM-NEXT: addi a1, a1, 1639
; RV32IM-NEXT: srai a0, a0, 16
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srai a0, a0, 16
-; RV32IM-NEXT: slli a0, a0, 16
; RV32IM-NEXT: srai a0, a0, 17
; RV32IM-NEXT: slli a1, a0, 16
; RV32IM-NEXT: srli a1, a1, 16
@@ -921,9 +910,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
; RV32IMZB-NEXT: addi a1, a1, 1639
; RV32IMZB-NEXT: sext.h a0, a0
; RV32IMZB-NEXT: mul a0, a0, a1
-; RV32IMZB-NEXT: srai a0, a0, 16
-; RV32IMZB-NEXT: sext.h a0, a0
-; RV32IMZB-NEXT: srai a0, a0, 1
+; RV32IMZB-NEXT: srai a0, a0, 17
; RV32IMZB-NEXT: zext.h a1, a0
; RV32IMZB-NEXT: srli a1, a1, 15
; RV32IMZB-NEXT: add a0, a0, a1
@@ -936,9 +923,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
; RV64IM-NEXT: addi a1, a1, 1639
; RV64IM-NEXT: srai a0, a0, 48
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: sraiw a0, a0, 16
-; RV64IM-NEXT: slli a0, a0, 48
-; RV64IM-NEXT: srai a0, a0, 49
+; RV64IM-NEXT: sraiw a0, a0, 17
; RV64IM-NEXT: slli a1, a0, 48
; RV64IM-NEXT: srli a1, a1, 48
; RV64IM-NEXT: srli a1, a1, 15
@@ -951,9 +936,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
; RV64IMZB-NEXT: addi a1, a1, 1639
; RV64IMZB-NEXT: sext.h a0, a0
; RV64IMZB-NEXT: mul a0, a0, a1
-; RV64IMZB-NEXT: sraiw a0, a0, 16
-; RV64IMZB-NEXT: sext.h a0, a0
-; RV64IMZB-NEXT: srai a0, a0, 1
+; RV64IMZB-NEXT: sraiw a0, a0, 17
; RV64IMZB-NEXT: zext.h a1, a0
; RV64IMZB-NEXT: srli a1, a1, 15
; RV64IMZB-NEXT: add a0, a0, a1
@@ -1071,7 +1054,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IM-NEXT: srai a2, a2, 48
; RV64IM-NEXT: mul a1, a2, a1
; RV64IM-NEXT: sraiw a1, a1, 16
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 48
; RV64IM-NEXT: srai a0, a1, 51
; RV64IM-NEXT: slli a1, a0, 48
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll
index a49e94f..620c5ec 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll
@@ -246,17 +246,11 @@ define double @fcvt_d_wu(i32 %a) nounwind {
}
define double @fcvt_d_wu_load(ptr %p) nounwind {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32I-LABEL: fcvt_d_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll
index fa09362..bbea792 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll
@@ -232,17 +232,11 @@ define float @fcvt_s_wu(i32 %a) nounwind {
}
define float @fcvt_s_wu_load(ptr %p) nounwind {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
index 78a2227b..a7c1c63 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
@@ -88,8 +88,7 @@ body: |
; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASSERT_SEXT]], [[ASHR]]
; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[SEXT_INREG]], [[ASHR]]
- ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
- ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG1]](s64)
+ ; RV64I-NEXT: $x10 = COPY [[XOR]](s64)
; RV64I-NEXT: PseudoRET implicit $x10
;
; RV64ZBB-LABEL: name: abs_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
index 8a786fc..46d1661 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
@@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -55,7 +55,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: sllw a1, a0, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -104,7 +104,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: srlw a1, a0, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -167,7 +167,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -276,7 +276,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_64:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -340,7 +340,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -451,7 +451,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_64:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -474,7 +474,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -490,7 +490,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64ZBB-LABEL: rotl_32_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: negw a2, a1
+; RV64ZBB-NEXT: neg a2, a1
; RV64ZBB-NEXT: sllw a1, a0, a1
; RV64ZBB-NEXT: srlw a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -506,7 +506,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_32_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: sllw a1, a0, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -531,7 +531,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -547,7 +547,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: sllw a2, a0, a1
-; RV64ZBB-NEXT: negw a1, a1
+; RV64ZBB-NEXT: neg a1, a1
; RV64ZBB-NEXT: srlw a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -563,7 +563,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -632,7 +632,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -648,7 +648,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64ZBB-LABEL: rotr_32_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: negw a2, a1
+; RV64ZBB-NEXT: neg a2, a1
; RV64ZBB-NEXT: srlw a1, a0, a1
; RV64ZBB-NEXT: sllw a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -664,7 +664,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_32_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: srlw a1, a0, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -689,7 +689,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -705,7 +705,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: srlw a2, a0, a1
-; RV64ZBB-NEXT: negw a1, a1
+; RV64ZBB-NEXT: neg a1, a1
; RV64ZBB-NEXT: sllw a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -721,7 +721,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -829,7 +829,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -884,7 +884,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64ZBB-LABEL: rotl_64_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: negw a2, a1
+; RV64ZBB-NEXT: neg a2, a1
; RV64ZBB-NEXT: sll a1, a0, a1
; RV64ZBB-NEXT: srl a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -939,7 +939,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotl_64_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -1005,7 +1005,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotl_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1062,7 +1062,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: sll a2, a0, a1
-; RV64ZBB-NEXT: negw a1, a1
+; RV64ZBB-NEXT: neg a1, a1
; RV64ZBB-NEXT: srl a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1119,7 +1119,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1277,7 +1277,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -1331,7 +1331,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64ZBB-LABEL: rotr_64_mask:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: negw a2, a1
+; RV64ZBB-NEXT: neg a2, a1
; RV64ZBB-NEXT: srl a1, a0, a1
; RV64ZBB-NEXT: sll a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -1385,7 +1385,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64XTHEADBB-LABEL: rotr_64_mask:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: negw a2, a1
+; RV64XTHEADBB-NEXT: neg a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -1451,7 +1451,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotr_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1508,7 +1508,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: srl a2, a0, a1
-; RV64ZBB-NEXT: negw a1, a1
+; RV64ZBB-NEXT: neg a1, a1
; RV64ZBB-NEXT: sll a0, a0, a1
; RV64ZBB-NEXT: or a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1565,7 +1565,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1701,7 +1701,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: sllw a4, a0, a2
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: srlw a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -1737,7 +1737,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: sllw a4, a0, a2
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: srlw a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1822,7 +1822,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: sll a4, a0, a2
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: srl a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -1972,7 +1972,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: sll a4, a0, a2
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: srl a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -2002,7 +2002,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: srlw a4, a0, a2
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: sllw a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -2038,7 +2038,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: srlw a4, a0, a2
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: sllw a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -2125,7 +2125,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I: # %bb.0:
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: sll a0, a0, a3
; RV64I-NEXT: or a0, a4, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -2279,7 +2279,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: srl a4, a0, a2
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: sll a0, a0, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -2312,8 +2312,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: sllw a4, a0, a2
; RV64I-NEXT: sllw a2, a1, a2
-; RV64I-NEXT: negw a5, a3
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a5, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: srlw a0, a0, a5
; RV64I-NEXT: srlw a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2353,8 +2353,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: sllw a4, a0, a2
; RV64XTHEADBB-NEXT: sllw a2, a1, a2
-; RV64XTHEADBB-NEXT: negw a5, a3
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a5, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: srlw a0, a0, a5
; RV64XTHEADBB-NEXT: srlw a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2464,7 +2464,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: sll a4, a0, a2
; RV64I-NEXT: sll a2, a1, a2
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: srl a0, a0, a3
; RV64I-NEXT: srl a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2664,7 +2664,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: sll a4, a0, a2
; RV64XTHEADBB-NEXT: sll a2, a1, a2
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: srl a0, a0, a3
; RV64XTHEADBB-NEXT: srl a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2697,8 +2697,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-NEXT: andi a3, a2, 31
; RV64I-NEXT: srlw a4, a0, a2
; RV64I-NEXT: srlw a2, a1, a2
-; RV64I-NEXT: negw a5, a3
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a5, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: sllw a0, a0, a5
; RV64I-NEXT: sllw a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -2738,8 +2738,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-NEXT: andi a3, a2, 31
; RV64XTHEADBB-NEXT: srlw a4, a0, a2
; RV64XTHEADBB-NEXT: srlw a2, a1, a2
-; RV64XTHEADBB-NEXT: negw a5, a3
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a5, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: sllw a0, a0, a5
; RV64XTHEADBB-NEXT: sllw a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -2850,7 +2850,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-NEXT: andi a3, a2, 63
; RV64I-NEXT: srl a4, a0, a2
; RV64I-NEXT: srl a2, a1, a2
-; RV64I-NEXT: negw a3, a3
+; RV64I-NEXT: neg a3, a3
; RV64I-NEXT: sll a0, a0, a3
; RV64I-NEXT: sll a1, a1, a3
; RV64I-NEXT: or a0, a4, a0
@@ -3052,7 +3052,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-NEXT: andi a3, a2, 63
; RV64XTHEADBB-NEXT: srl a4, a0, a2
; RV64XTHEADBB-NEXT: srl a2, a1, a2
-; RV64XTHEADBB-NEXT: negw a3, a3
+; RV64XTHEADBB-NEXT: neg a3, a3
; RV64XTHEADBB-NEXT: sll a0, a0, a3
; RV64XTHEADBB-NEXT: sll a1, a1, a3
; RV64XTHEADBB-NEXT: or a0, a4, a0
@@ -3116,7 +3116,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_64_zext:
; RV64I: # %bb.0:
; RV64I-NEXT: li a2, 64
-; RV64I-NEXT: subw a2, a2, a1
+; RV64I-NEXT: sub a2, a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -3171,7 +3171,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotl_64_zext:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a2, 64
-; RV64ZBB-NEXT: subw a2, a2, a1
+; RV64ZBB-NEXT: sub a2, a2, a1
; RV64ZBB-NEXT: sll a1, a0, a1
; RV64ZBB-NEXT: srl a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -3226,7 +3226,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: li a2, 64
-; RV64XTHEADBB-NEXT: subw a2, a2, a1
+; RV64XTHEADBB-NEXT: sub a2, a2, a1
; RV64XTHEADBB-NEXT: sll a1, a0, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
@@ -3289,7 +3289,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_64_zext:
; RV64I: # %bb.0:
; RV64I-NEXT: li a2, 64
-; RV64I-NEXT: subw a2, a2, a1
+; RV64I-NEXT: sub a2, a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -3343,7 +3343,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64ZBB-LABEL: rotr_64_zext:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a2, 64
-; RV64ZBB-NEXT: subw a2, a2, a1
+; RV64ZBB-NEXT: sub a2, a2, a1
; RV64ZBB-NEXT: srl a1, a0, a1
; RV64ZBB-NEXT: sll a0, a0, a2
; RV64ZBB-NEXT: or a0, a1, a0
@@ -3397,7 +3397,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: li a2, 64
-; RV64XTHEADBB-NEXT: subw a2, a2, a1
+; RV64XTHEADBB-NEXT: sub a2, a2, a1
; RV64XTHEADBB-NEXT: srl a1, a0, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a2
; RV64XTHEADBB-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
index 1eddb8f..b7f84ba 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll
@@ -107,7 +107,7 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: rol_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -125,7 +125,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: rol_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a3, a1
+; RV64I-NEXT: neg a3, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
@@ -146,7 +146,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: rol_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: negw a2, a0
+; RV64I-NEXT: neg a2, a0
; RV64I-NEXT: sllw a0, a1, a0
; RV64I-NEXT: srlw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -166,7 +166,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
define i64 @rol_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: rol_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -185,7 +185,7 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: ror_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -203,7 +203,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: ror_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a3, a1
+; RV64I-NEXT: neg a3, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a3
; RV64I-NEXT: or a0, a1, a0
@@ -224,7 +224,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: ror_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: negw a2, a0
+; RV64I-NEXT: neg a2, a0
; RV64I-NEXT: srlw a0, a1, a0
; RV64I-NEXT: sllw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -244,7 +244,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: ror_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9690302..2dd3bb3 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -31,7 +31,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -88,7 +88,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -103,7 +103,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srliw a0, a0, 24
; RV64I-NEXT: li a1, 32
-; RV64I-NEXT: subw a0, a1, a0
+; RV64I-NEXT: sub a0, a1, a0
; RV64I-NEXT: j .LBB1_3
; RV64I-NEXT: .LBB1_2:
; RV64I-NEXT: li a0, 32
@@ -153,7 +153,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -168,7 +168,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
; RV64I-NEXT: call __muldi3
; RV64I-NEXT: srliw a0, a0, 24
; RV64I-NEXT: li a1, 32
-; RV64I-NEXT: subw a1, a1, a0
+; RV64I-NEXT: sub a1, a1, a0
; RV64I-NEXT: .LBB2_2: # %cond.end
; RV64I-NEXT: subw a0, s0, a1
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
@@ -212,7 +212,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -283,7 +283,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -412,7 +412,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -455,7 +455,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -497,7 +497,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -553,7 +553,7 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -672,7 +672,7 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -728,7 +728,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -748,7 +748,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
;
; RV64ZBB-LABEL: ctpop_i32_load:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: lwu a0, 0(a0)
+; RV64ZBB-NEXT: lw a0, 0(a0)
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: ret
%a = load i32, ptr %p
@@ -1053,9 +1053,8 @@ define signext i32 @abs_i32_sext(i32 signext %x) {
; RV64I-LABEL: abs_i32_sext:
; RV64I: # %bb.0:
; RV64I-NEXT: srai a1, a0, 31
-; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: addw a0, a0, a1
; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: abs_i32_sext:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index cd59c9e..ba058ca 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -114,7 +114,7 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
define i64 @pack_i64_3(ptr %0, ptr %1) {
; RV64I-LABEL: pack_i64_3:
; RV64I: # %bb.0:
-; RV64I-NEXT: lwu a0, 0(a0)
+; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: lwu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a1
@@ -122,8 +122,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
;
; RV64ZBKB-LABEL: pack_i64_3:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: lwu a0, 0(a0)
-; RV64ZBKB-NEXT: lwu a1, 0(a1)
+; RV64ZBKB-NEXT: lw a0, 0(a0)
+; RV64ZBKB-NEXT: lw a1, 0(a1)
; RV64ZBKB-NEXT: pack a0, a1, a0
; RV64ZBKB-NEXT: ret
%3 = load i32, ptr %0, align 4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
index 8b262db..d634cc9 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -330,13 +330,13 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a4, a2, a3
+; RV64I-NEXT: sub a4, a2, a3
; RV64I-NEXT: srl a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB6_3
; RV64I-NEXT: j .LBB6_4
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: neg a5, a2
; RV64I-NEXT: sll a5, a1, a5
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: beqz a2, .LBB6_4
@@ -476,13 +476,13 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a4, a2, a3
+; RV64I-NEXT: sub a4, a2, a3
; RV64I-NEXT: sra a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB7_3
; RV64I-NEXT: j .LBB7_4
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: neg a5, a2
; RV64I-NEXT: sll a5, a1, a5
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: beqz a2, .LBB7_4
@@ -615,13 +615,13 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: bltu a2, a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a0, 0
-; RV64I-NEXT: subw a4, a2, a4
+; RV64I-NEXT: sub a4, a2, a4
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: bnez a2, .LBB8_3
; RV64I-NEXT: j .LBB8_4
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: sll a0, a3, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srl a3, a3, a4
; RV64I-NEXT: sll a4, a1, a2
; RV64I-NEXT: or a3, a3, a4
@@ -685,7 +685,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
;
; RV64I-LABEL: fshr64_minsize:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -914,12 +914,12 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: li a4, 64
; RV64I-NEXT: bltu a5, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a3, a5, a4
+; RV64I-NEXT: sub a3, a5, a4
; RV64I-NEXT: srl a6, a1, a3
; RV64I-NEXT: j .LBB10_3
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a6, a5
+; RV64I-NEXT: neg a6, a5
; RV64I-NEXT: sll a6, a1, a6
; RV64I-NEXT: or a6, a3, a6
; RV64I-NEXT: .LBB10_3:
@@ -928,7 +928,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: # %bb.4:
; RV64I-NEXT: mv a3, a6
; RV64I-NEXT: .LBB10_5:
-; RV64I-NEXT: negw a7, a2
+; RV64I-NEXT: neg a7, a2
; RV64I-NEXT: bltu a5, a4, .LBB10_7
; RV64I-NEXT: # %bb.6:
; RV64I-NEXT: li a2, 0
@@ -940,13 +940,13 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: bltu a6, a4, .LBB10_10
; RV64I-NEXT: # %bb.9:
; RV64I-NEXT: li a5, 0
-; RV64I-NEXT: subw a4, a6, a4
+; RV64I-NEXT: sub a4, a6, a4
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: bnez a6, .LBB10_11
; RV64I-NEXT: j .LBB10_12
; RV64I-NEXT: .LBB10_10:
; RV64I-NEXT: sll a5, a0, a7
-; RV64I-NEXT: negw a4, a6
+; RV64I-NEXT: neg a4, a6
; RV64I-NEXT: srl a0, a0, a4
; RV64I-NEXT: sll a4, a1, a7
; RV64I-NEXT: or a0, a0, a4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
index 69519c0..014b1c1 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -758,13 +758,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB6_3
; RV64I-NEXT: j .LBB6_4
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB6_4
@@ -1091,13 +1091,13 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB7_3
; RV64I-NEXT: j .LBB7_4
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB7_4
@@ -1425,13 +1425,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu a3, a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: subw a5, a3, a5
+; RV64I-NEXT: sub a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB8_3
; RV64I-NEXT: j .LBB8_4
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: sll a1, a4, a3
-; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: neg a5, a3
; RV64I-NEXT: srl a4, a4, a5
; RV64I-NEXT: sll a5, a0, a3
; RV64I-NEXT: or a4, a4, a5
@@ -1754,13 +1754,13 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu a3, a5, .LBB9_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: subw a5, a3, a5
+; RV64I-NEXT: sub a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB9_3
; RV64I-NEXT: j .LBB9_4
; RV64I-NEXT: .LBB9_2:
; RV64I-NEXT: sll a1, a4, a3
-; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: neg a5, a3
; RV64I-NEXT: srl a4, a4, a5
; RV64I-NEXT: sll a5, a0, a3
; RV64I-NEXT: or a4, a4, a5
@@ -2083,13 +2083,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB10_3
; RV64I-NEXT: j .LBB10_4
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB10_4
@@ -2416,13 +2416,13 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB11_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB11_3
; RV64I-NEXT: j .LBB11_4
; RV64I-NEXT: .LBB11_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB11_4
@@ -2796,8 +2796,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 3
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB12_2
; RV64I-NEXT: # %bb.1:
@@ -2842,7 +2842,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bgeu t6, a7, .LBB12_14
; RV64I-NEXT: .LBB12_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB12_15
@@ -2851,7 +2851,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, a7, .LBB12_12
; RV64I-NEXT: .LBB12_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB12_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -2862,13 +2862,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB12_17:
; RV64I-NEXT: bltu s0, a7, .LBB12_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB12_20
; RV64I-NEXT: j .LBB12_21
; RV64I-NEXT: .LBB12_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB12_21
@@ -3720,8 +3720,8 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 5
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB13_2
; RV64I-NEXT: # %bb.1:
@@ -3766,7 +3766,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bgeu t6, a7, .LBB13_14
; RV64I-NEXT: .LBB13_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB13_15
@@ -3775,7 +3775,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, a7, .LBB13_12
; RV64I-NEXT: .LBB13_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB13_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -3786,13 +3786,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB13_17:
; RV64I-NEXT: bltu s0, a7, .LBB13_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB13_20
; RV64I-NEXT: j .LBB13_21
; RV64I-NEXT: .LBB13_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB13_21
@@ -4644,8 +4644,8 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 6
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB14_2
; RV64I-NEXT: # %bb.1:
@@ -4690,7 +4690,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bgeu t6, a7, .LBB14_14
; RV64I-NEXT: .LBB14_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB14_15
@@ -4699,7 +4699,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, a7, .LBB14_12
; RV64I-NEXT: .LBB14_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB14_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -4710,13 +4710,13 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB14_17:
; RV64I-NEXT: bltu s0, a7, .LBB14_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB14_20
; RV64I-NEXT: j .LBB14_21
; RV64I-NEXT: .LBB14_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB14_21
@@ -5542,8 +5542,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB15_2
; RV64I-NEXT: # %bb.1:
@@ -5585,11 +5585,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB15_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB15_8
; RV64I-NEXT: .LBB15_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB15_8:
@@ -5637,13 +5637,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu s0, t0, .LBB15_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB15_21
; RV64I-NEXT: j .LBB15_22
; RV64I-NEXT: .LBB15_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -6456,8 +6456,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB16_2
; RV64I-NEXT: # %bb.1:
@@ -6499,11 +6499,11 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB16_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB16_8
; RV64I-NEXT: .LBB16_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB16_8:
@@ -6551,13 +6551,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu s0, t0, .LBB16_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB16_21
; RV64I-NEXT: j .LBB16_22
; RV64I-NEXT: .LBB16_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -7370,8 +7370,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB17_2
; RV64I-NEXT: # %bb.1:
@@ -7413,11 +7413,11 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB17_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB17_8
; RV64I-NEXT: .LBB17_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB17_8:
@@ -7465,13 +7465,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: bltu s0, t0, .LBB17_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB17_21
; RV64I-NEXT: j .LBB17_22
; RV64I-NEXT: .LBB17_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -8310,8 +8310,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB18_2
; RV64I-NEXT: # %bb.1:
@@ -8356,7 +8356,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bgeu t6, t0, .LBB18_14
; RV64I-NEXT: .LBB18_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB18_15
@@ -8365,7 +8365,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, t0, .LBB18_12
; RV64I-NEXT: .LBB18_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB18_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -8376,13 +8376,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB18_17:
; RV64I-NEXT: bltu s0, t0, .LBB18_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB18_20
; RV64I-NEXT: j .LBB18_21
; RV64I-NEXT: .LBB18_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB18_21
@@ -9241,8 +9241,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB19_2
; RV64I-NEXT: # %bb.1:
@@ -9287,7 +9287,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bgeu t6, t0, .LBB19_14
; RV64I-NEXT: .LBB19_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB19_15
@@ -9296,7 +9296,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, t0, .LBB19_12
; RV64I-NEXT: .LBB19_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB19_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -9307,13 +9307,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB19_17:
; RV64I-NEXT: bltu s0, t0, .LBB19_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB19_20
; RV64I-NEXT: j .LBB19_21
; RV64I-NEXT: .LBB19_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB19_21
@@ -10172,8 +10172,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB20_2
; RV64I-NEXT: # %bb.1:
@@ -10218,7 +10218,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bgeu t6, t0, .LBB20_14
; RV64I-NEXT: .LBB20_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB20_15
@@ -10227,7 +10227,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, t0, .LBB20_12
; RV64I-NEXT: .LBB20_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB20_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -10238,13 +10238,13 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB20_17:
; RV64I-NEXT: bltu s0, t0, .LBB20_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB20_20
; RV64I-NEXT: j .LBB20_21
; RV64I-NEXT: .LBB20_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB20_21
diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll
index 3fb0f2c..41f73f5 100644
--- a/llvm/test/CodeGen/RISCV/abds-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abds-neg.ll
@@ -2221,7 +2221,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a1, a0
@@ -2236,7 +2236,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64ZBB-LABEL: abd_subnsw_i32:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: sraiw a1, a0, 31
; RV64ZBB-NEXT: xor a0, a0, a1
; RV64ZBB-NEXT: subw a0, a1, a0
@@ -2258,7 +2258,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a1, a0
@@ -2273,7 +2273,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64ZBB-LABEL: abd_subnsw_i32_undef:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: sraiw a1, a0, 31
; RV64ZBB-NEXT: xor a0, a0, a1
; RV64ZBB-NEXT: subw a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index efb4e1a..28a95ef 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -1733,21 +1733,13 @@ define i8 @abd_subnsw_i8(i8 %a, i8 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i8:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.b a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i8:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.b a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i8:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.b a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i8 %a, %b
%abs = call i8 @llvm.abs.i8(i8 %sub, i1 false)
ret i8 %abs
@@ -1772,21 +1764,13 @@ define i8 @abd_subnsw_i8_undef(i8 %a, i8 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i8_undef:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.b a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i8_undef:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.b a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i8_undef:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.b a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i8 %a, %b
%abs = call i8 @llvm.abs.i8(i8 %sub, i1 true)
ret i8 %abs
@@ -1811,21 +1795,13 @@ define i16 @abd_subnsw_i16(i16 %a, i16 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i16:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.h a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i16:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.h a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i16:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.h a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i16 %a, %b
%abs = call i16 @llvm.abs.i16(i16 %sub, i1 false)
ret i16 %abs
@@ -1850,21 +1826,13 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i16_undef:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.h a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i16_undef:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.h a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i16_undef:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.h a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
%sub = sub nsw i16 %a, %b
%abs = call i16 @llvm.abs.i16(i16 %sub, i1 true)
ret i16 %abs
@@ -1881,7 +1849,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
@@ -1916,7 +1884,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
@@ -2317,7 +2285,7 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_sub_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index aac355e..3b2cab2 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -20,7 +20,7 @@ define i32 @add_mul_combine_accept_a1(i32 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 37
@@ -41,7 +41,7 @@ define signext i32 @add_mul_combine_accept_a2(i32 signext %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 37
@@ -93,7 +93,7 @@ define i32 @add_mul_combine_accept_b1(i32 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh3add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -118,7 +118,7 @@ define signext i32 @add_mul_combine_accept_b2(i32 signext %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh3add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -456,7 +456,7 @@ define i32 @add_mul_combine_reject_f1(i32 %x) {
; RV64IMB-NEXT: addi a0, a0, 1972
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
@@ -479,7 +479,7 @@ define signext i32 @add_mul_combine_reject_f2(i32 signext %x) {
; RV64IMB-NEXT: addi a0, a0, 1972
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
diff --git a/llvm/test/CodeGen/RISCV/aext-to-sext.ll b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
index f3f71a9..34549a0 100644
--- a/llvm/test/CodeGen/RISCV/aext-to-sext.ll
+++ b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
@@ -16,7 +16,7 @@ define void @quux(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: subw s0, a1, a0
+; RV64I-NEXT: sub s0, a1, a0
; RV64I-NEXT: .LBB0_2: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call hoge
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index bebc097..7d29ac9 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -4582,7 +4582,7 @@ define signext i32 @atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB56_2: # %else
-; RV64I-NEXT: lwu a1, 0(a0)
+; RV64I-NEXT: lw a1, 0(a0)
; RV64I-NEXT: andi a2, a1, 1
; RV64I-NEXT: sw a2, 0(a0)
; RV64I-NEXT: sext.w a0, a1
@@ -4700,7 +4700,7 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB57_2: # %else
-; RV64I-NEXT: lwu a1, 0(a0)
+; RV64I-NEXT: lw a1, 0(a0)
; RV64I-NEXT: andi a2, a1, 1
; RV64I-NEXT: sw a2, 0(a0)
; RV64I-NEXT: sext.w a0, a1
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index 27704d1..ea9786d 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -161,7 +161,7 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: sltu t0, t0, a5
; RV64IA-NEXT: addi t0, t0, -1
; RV64IA-NEXT: and t0, t0, a1
-; RV64IA-NEXT: subw a6, a6, t0
+; RV64IA-NEXT: sub a6, a6, t0
; RV64IA-NEXT: zext.b a6, a6
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a3, a3, a4
@@ -345,7 +345,7 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: sltu t1, t1, a6
; RV64IA-NEXT: addi t1, t1, -1
; RV64IA-NEXT: and t1, t1, a1
-; RV64IA-NEXT: subw a7, a7, t1
+; RV64IA-NEXT: sub a7, a7, t1
; RV64IA-NEXT: and a7, a7, a3
; RV64IA-NEXT: sllw a7, a7, a0
; RV64IA-NEXT: and a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index ada1933..4e04f38 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -150,7 +150,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: zext.b a7, a5
; RV64IA-NEXT: addi a5, a5, 1
; RV64IA-NEXT: sltu a7, a7, a1
-; RV64IA-NEXT: negw a7, a7
+; RV64IA-NEXT: neg a7, a7
; RV64IA-NEXT: and a5, a7, a5
; RV64IA-NEXT: zext.b a5, a5
; RV64IA-NEXT: sllw a5, a5, a0
@@ -325,7 +325,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: addi a6, a6, 1
; RV64IA-NEXT: sltu t0, t0, a1
; RV64IA-NEXT: and a6, a6, a3
-; RV64IA-NEXT: negw t0, t0
+; RV64IA-NEXT: neg t0, t0
; RV64IA-NEXT: and a6, t0, a6
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index d566069..a28b818 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -435,7 +435,7 @@
; RV32XCVMEM: .attribute 5, "rv32i2p1_xcvmem1p0"
; RV32XCVSIMD: .attribute 5, "rv32i2p1_xcvsimd1p0"
; RV32XCVBI: .attribute 5, "rv32i2p1_xcvbi1p0"
-; RV32XSFVFWMACCQQQ: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0"
+; RV32XSFVFWMACCQQQ: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfvfwmaccqqq1p0"
; RV32XTHEADCMO: .attribute 5, "rv32i2p1_xtheadcmo1p0"
; RV32XTHEADCONDMOV: .attribute 5, "rv32i2p1_xtheadcondmov1p0"
; RV32XTHEADFMEMIDX: .attribute 5, "rv32i2p1_xtheadfmemidx1p0"
@@ -610,7 +610,7 @@
; RV64SVVPTC: .attribute 5, "rv64i2p1_svvptc1p0"
; RV64SVINVAL: .attribute 5, "rv64i2p1_svinval1p0"
; RV64XVENTANACONDOPS: .attribute 5, "rv64i2p1_xventanacondops1p0"
-; RV64XSFVFWMACCQQQ: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0"
+; RV64XSFVFWMACCQQQ: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfvfwmaccqqq1p0"
; RV64XTHEADBA: .attribute 5, "rv64i2p1_xtheadba1p0"
; RV64XTHEADBB: .attribute 5, "rv64i2p1_xtheadbb1p0"
; RV64XTHEADBS: .attribute 5, "rv64i2p1_xtheadbs1p0"
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 3422ea6..6207a17 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -1074,7 +1074,7 @@ define bfloat @fcvt_bf16_wu_load(ptr %p) nounwind {
;
; CHECK64ZFBFMIN-LABEL: fcvt_bf16_wu_load:
; CHECK64ZFBFMIN: # %bb.0:
-; CHECK64ZFBFMIN-NEXT: lwu a0, 0(a0)
+; CHECK64ZFBFMIN-NEXT: lw a0, 0(a0)
; CHECK64ZFBFMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64ZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5
; CHECK64ZFBFMIN-NEXT: ret
@@ -1083,7 +1083,7 @@ define bfloat @fcvt_bf16_wu_load(ptr %p) nounwind {
; RV64ID: # %bb.0:
; RV64ID-NEXT: addi sp, sp, -16
; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT: lwu a0, 0(a0)
+; RV64ID-NEXT: lw a0, 0(a0)
; RV64ID-NEXT: fcvt.s.wu fa0, a0
; RV64ID-NEXT: call __truncsfbf2
; RV64ID-NEXT: fmv.x.w a0, fa0
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 72489185..530980c 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -63,7 +63,7 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -262,7 +262,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64I-NEXT: sext.w a1, a0
; RV64I-NEXT: beqz a1, .LBB2_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -270,16 +270,16 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -318,7 +318,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64M-NEXT: sext.w a1, a0
; RV64M-NEXT: beqz a1, .LBB2_2
; RV64M-NEXT: # %bb.1: # %cond.false
-; RV64M-NEXT: negw a1, a0
+; RV64M-NEXT: neg a1, a0
; RV64M-NEXT: and a0, a0, a1
; RV64M-NEXT: lui a1, 30667
; RV64M-NEXT: addi a1, a1, 1329
@@ -597,7 +597,7 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -743,7 +743,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
;
; RV64I-LABEL: test_cttz_i32_zero_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -751,16 +751,16 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -788,7 +788,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
;
; RV64M-LABEL: test_cttz_i32_zero_undef:
; RV64M: # %bb.0:
-; RV64M-NEXT: negw a1, a0
+; RV64M-NEXT: neg a1, a0
; RV64M-NEXT: and a0, a0, a1
; RV64M-NEXT: lui a1, 30667
; RV64M-NEXT: addi a1, a1, 1329
@@ -1039,7 +1039,7 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: not a0, a0
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -1711,7 +1711,7 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: not a0, a0
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -2296,7 +2296,7 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64NOZBB: # %bb.0:
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -2336,7 +2336,7 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srli a1, a0, 1
; RV64XTHEADBB-NEXT: andi a1, a1, 85
-; RV64XTHEADBB-NEXT: subw a0, a0, a1
+; RV64XTHEADBB-NEXT: sub a0, a0, a1
; RV64XTHEADBB-NEXT: andi a1, a0, 51
; RV64XTHEADBB-NEXT: srli a0, a0, 2
; RV64XTHEADBB-NEXT: andi a0, a0, 51
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 637fb31..a1061fbb 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -163,7 +163,7 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind {
; RV64I-LABEL: ctz_dereferencing_pointer_zext:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lw a0, 0(a0)
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -171,16 +171,16 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -248,7 +248,7 @@ define signext i32 @ctz1(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz1:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -256,16 +256,16 @@ define signext i32 @ctz1(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -331,7 +331,7 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz1_flipped:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -339,16 +339,16 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -412,7 +412,7 @@ define signext i32 @ctz2(i32 signext %x) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: beqz a0, .LBB4_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -420,16 +420,16 @@ define signext i32 @ctz2(i32 signext %x) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -490,7 +490,7 @@ define signext i32 @ctz3(i32 signext %x) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: beqz a0, .LBB5_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -498,16 +498,16 @@ define signext i32 @ctz3(i32 signext %x) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -824,7 +824,7 @@ define signext i32 @ctz5(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz5:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -832,16 +832,16 @@ define signext i32 @ctz5(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -907,7 +907,7 @@ define signext i32 @ctz6(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz6:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -915,16 +915,16 @@ define signext i32 @ctz6(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -997,7 +997,7 @@ define signext i32 @globalVar() nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lui a0, %hi(global_x)
; RV64I-NEXT: lw a0, %lo(global_x)(a0)
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -1005,16 +1005,16 @@ define signext i32 @globalVar() nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index ea8b04d..53c3f58 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -54,7 +54,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: srli a0, a0, 2
@@ -67,7 +67,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: addi a2, a2, -1755
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: srli a0, a0, 2
@@ -193,7 +193,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64IM-NEXT: li a2, 37
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srli a0, a0, 57
; RV64IM-NEXT: add a0, a0, a1
@@ -206,7 +206,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64IMZB-NEXT: sh3add a2, a1, a1
; RV64IMZB-NEXT: sh2add a1, a2, a1
; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: slli a0, a0, 56
; RV64IMZB-NEXT: srli a0, a0, 57
; RV64IMZB-NEXT: add a0, a0, a1
@@ -257,7 +257,7 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV64-NEXT: lui a2, 149808
; RV64-NEXT: mulhu a1, a1, a2
; RV64-NEXT: srli a1, a1, 16
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
; RV64-NEXT: srli a0, a0, 49
; RV64-NEXT: add a0, a0, a1
@@ -367,7 +367,7 @@ define i32 @sdiv_constant_sub_srai(i32 %a) nounwind {
; RV64-NEXT: addi a2, a2, -1171
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: srliw a0, a1, 31
; RV64-NEXT: sraiw a1, a1, 2
; RV64-NEXT: add a0, a1, a0
@@ -666,7 +666,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a1, a1, 56
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 56
; RV64IM-NEXT: srli a0, a1, 63
; RV64IM-NEXT: srai a1, a1, 58
@@ -679,7 +679,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: li a2, 109
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a1, a1, a0
+; RV64IMZB-NEXT: sub a1, a1, a0
; RV64IMZB-NEXT: slli a1, a1, 56
; RV64IMZB-NEXT: srli a0, a1, 63
; RV64IMZB-NEXT: srai a1, a1, 58
@@ -889,7 +889,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IM-NEXT: addi a2, a2, 1911
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 16
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 48
; RV64IM-NEXT: srli a0, a1, 63
; RV64IM-NEXT: srai a1, a1, 51
@@ -903,7 +903,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IMZB-NEXT: addi a2, a2, 1911
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 16
-; RV64IMZB-NEXT: subw a1, a1, a0
+; RV64IMZB-NEXT: sub a1, a1, a0
; RV64IMZB-NEXT: slli a1, a1, 48
; RV64IMZB-NEXT: srli a0, a1, 63
; RV64IMZB-NEXT: srai a1, a1, 51
diff --git a/llvm/test/CodeGen/RISCV/double-convert-strict.ll b/llvm/test/CodeGen/RISCV/double-convert-strict.ll
index 2b1ec10..9a5e357 100644
--- a/llvm/test/CodeGen/RISCV/double-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert-strict.ll
@@ -347,17 +347,11 @@ define double @fcvt_d_wu(i32 %a) nounwind strictfp {
declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)
define double @fcvt_d_wu_load(ptr %p) nounwind strictfp {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV32IZFINXZDINX: # %bb.0:
@@ -367,7 +361,7 @@ define double @fcvt_d_wu_load(ptr %p) nounwind strictfp {
;
; RV64IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: lwu a0, 0(a0)
+; RV64IZFINXZDINX-NEXT: lw a0, 0(a0)
; RV64IZFINXZDINX-NEXT: fcvt.d.wu a0, a0
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index fad9e21..a2e6186 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -582,17 +582,11 @@ define double @fcvt_d_wu(i32 %a) nounwind {
}
define double @fcvt_d_wu_load(ptr %p) nounwind {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV32IZFINXZDINX: # %bb.0:
@@ -602,7 +596,7 @@ define double @fcvt_d_wu_load(ptr %p) nounwind {
;
; RV64IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: lwu a0, 0(a0)
+; RV64IZFINXZDINX-NEXT: lw a0, 0(a0)
; RV64IZFINXZDINX-NEXT: fcvt.d.wu a0, a0
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/float-convert-strict.ll b/llvm/test/CodeGen/RISCV/float-convert-strict.ll
index 0c265e1..1b25a2b 100644
--- a/llvm/test/CodeGen/RISCV/float-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert-strict.ll
@@ -236,29 +236,17 @@ define float @fcvt_s_wu(i32 %a) nounwind strictfp {
declare float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata, metadata)
define float @fcvt_s_wu_load(ptr %p) nounwind strictfp {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
-;
-; RV32IZFINX-LABEL: fcvt_s_wu_load:
-; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lw a0, 0(a0)
-; RV32IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV32IZFINX-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
-; RV64IZFINX-LABEL: fcvt_s_wu_load:
-; RV64IZFINX: # %bb.0:
-; RV64IZFINX-NEXT: lwu a0, 0(a0)
-; RV64IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV64IZFINX-NEXT: ret
+; CHECKIZFINX-LABEL: fcvt_s_wu_load:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: lw a0, 0(a0)
+; CHECKIZFINX-NEXT: fcvt.s.wu a0, a0
+; CHECKIZFINX-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 1cb7b27..60349a0 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -482,29 +482,17 @@ define float @fcvt_s_wu(i32 %a) nounwind {
}
define float @fcvt_s_wu_load(ptr %p) nounwind {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
-;
-; RV32IZFINX-LABEL: fcvt_s_wu_load:
-; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lw a0, 0(a0)
-; RV32IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV32IZFINX-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
-; RV64IZFINX-LABEL: fcvt_s_wu_load:
-; RV64IZFINX: # %bb.0:
-; RV64IZFINX-NEXT: lwu a0, 0(a0)
-; RV64IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV64IZFINX-NEXT: ret
+; CHECKIZFINX-LABEL: fcvt_s_wu_load:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: lw a0, 0(a0)
+; CHECKIZFINX-NEXT: fcvt.s.wu a0, a0
+; CHECKIZFINX-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 246e6a6..117e3e4 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IF-NEXT: mv a1, a0
; RV32IF-NEXT: addi a0, sp, 8
; RV32IF-NEXT: call __fixdfti
-; RV32IF-NEXT: lw a0, 8(sp)
-; RV32IF-NEXT: lw a1, 12(sp)
-; RV32IF-NEXT: lw a2, 20(sp)
+; RV32IF-NEXT: lw a0, 20(sp)
+; RV32IF-NEXT: lw a1, 8(sp)
+; RV32IF-NEXT: lw a2, 12(sp)
; RV32IF-NEXT: lw a3, 16(sp)
-; RV32IF-NEXT: beqz a2, .LBB47_2
+; RV32IF-NEXT: beqz a0, .LBB47_2
; RV32IF-NEXT: # %bb.1: # %entry
-; RV32IF-NEXT: slti a4, a2, 0
+; RV32IF-NEXT: slti a4, a0, 0
; RV32IF-NEXT: j .LBB47_3
; RV32IF-NEXT: .LBB47_2:
; RV32IF-NEXT: seqz a4, a3
; RV32IF-NEXT: .LBB47_3: # %entry
; RV32IF-NEXT: xori a3, a3, 1
-; RV32IF-NEXT: or a3, a3, a2
+; RV32IF-NEXT: or a3, a3, a0
; RV32IF-NEXT: seqz a3, a3
; RV32IF-NEXT: addi a3, a3, -1
; RV32IF-NEXT: and a3, a3, a4
; RV32IF-NEXT: neg a3, a3
+; RV32IF-NEXT: and a2, a3, a2
; RV32IF-NEXT: and a1, a3, a1
; RV32IF-NEXT: and a0, a3, a0
-; RV32IF-NEXT: and a2, a3, a2
-; RV32IF-NEXT: slti a2, a2, 0
-; RV32IF-NEXT: addi a2, a2, -1
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: and a1, a2, a1
+; RV32IF-NEXT: slti a0, a0, 0
+; RV32IF-NEXT: addi a3, a0, -1
+; RV32IF-NEXT: and a0, a3, a1
+; RV32IF-NEXT: and a1, a3, a2
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .cfi_offset ra, -4
; RV32IFD-NEXT: addi a0, sp, 8
; RV32IFD-NEXT: call __fixdfti
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 20(sp)
+; RV32IFD-NEXT: lw a0, 20(sp)
+; RV32IFD-NEXT: lw a1, 8(sp)
+; RV32IFD-NEXT: lw a2, 12(sp)
; RV32IFD-NEXT: lw a3, 16(sp)
-; RV32IFD-NEXT: beqz a2, .LBB47_2
+; RV32IFD-NEXT: beqz a0, .LBB47_2
; RV32IFD-NEXT: # %bb.1: # %entry
-; RV32IFD-NEXT: slti a4, a2, 0
+; RV32IFD-NEXT: slti a4, a0, 0
; RV32IFD-NEXT: j .LBB47_3
; RV32IFD-NEXT: .LBB47_2:
; RV32IFD-NEXT: seqz a4, a3
; RV32IFD-NEXT: .LBB47_3: # %entry
; RV32IFD-NEXT: xori a3, a3, 1
-; RV32IFD-NEXT: or a3, a3, a2
+; RV32IFD-NEXT: or a3, a3, a0
; RV32IFD-NEXT: seqz a3, a3
; RV32IFD-NEXT: addi a3, a3, -1
; RV32IFD-NEXT: and a3, a3, a4
; RV32IFD-NEXT: neg a3, a3
+; RV32IFD-NEXT: and a2, a3, a2
; RV32IFD-NEXT: and a1, a3, a1
; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: and a2, a3, a2
-; RV32IFD-NEXT: slti a2, a2, 0
-; RV32IFD-NEXT: addi a2, a2, -1
-; RV32IFD-NEXT: and a0, a2, a0
-; RV32IFD-NEXT: and a1, a2, a1
+; RV32IFD-NEXT: slti a0, a0, 0
+; RV32IFD-NEXT: addi a3, a0, -1
+; RV32IFD-NEXT: and a0, a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB50_2
+; RV32-NEXT: beqz a0, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB50_3
; RV32-NEXT: .LBB50_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB50_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB53_2
+; RV32-NEXT: beqz a0, .LBB53_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB53_3
; RV32-NEXT: .LBB53_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB53_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
index 0a04d44..675e230 100644
--- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
@@ -1461,29 +1461,17 @@ define half @fcvt_h_wu(i32 %a) nounwind strictfp {
declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata)
define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
-; RV32IZFH-LABEL: fcvt_h_wu_load:
-; RV32IZFH: # %bb.0:
-; RV32IZFH-NEXT: lw a0, 0(a0)
-; RV32IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV32IZFH-NEXT: ret
-;
-; RV64IZFH-LABEL: fcvt_h_wu_load:
-; RV64IZFH: # %bb.0:
-; RV64IZFH-NEXT: lwu a0, 0(a0)
-; RV64IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV64IZFH-NEXT: ret
-;
-; RV32IZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZHINX: # %bb.0:
-; RV32IZHINX-NEXT: lw a0, 0(a0)
-; RV32IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZHINX-NEXT: ret
+; CHECKIZFH-LABEL: fcvt_h_wu_load:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: lw a0, 0(a0)
+; CHECKIZFH-NEXT: fcvt.h.wu fa0, a0
+; CHECKIZFH-NEXT: ret
;
-; RV64IZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZHINX: # %bb.0:
-; RV64IZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZHINX-NEXT: ret
+; CHECKIZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZHINX: # %bb.0:
+; CHECKIZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZHINX-NEXT: ret
;
; RV32IDZFH-LABEL: fcvt_h_wu_load:
; RV32IDZFH: # %bb.0:
@@ -1493,7 +1481,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; RV64IDZFH-LABEL: fcvt_h_wu_load:
; RV64IDZFH: # %bb.0:
-; RV64IDZFH-NEXT: lwu a0, 0(a0)
+; RV64IDZFH-NEXT: lw a0, 0(a0)
; RV64IDZFH-NEXT: fcvt.h.wu fa0, a0
; RV64IDZFH-NEXT: ret
;
@@ -1505,7 +1493,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; RV64IZDINXZHINX-LABEL: fcvt_h_wu_load:
; RV64IZDINXZHINX: # %bb.0:
-; RV64IZDINXZHINX-NEXT: lwu a0, 0(a0)
+; RV64IZDINXZHINX-NEXT: lw a0, 0(a0)
; RV64IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
; RV64IZDINXZHINX-NEXT: ret
;
@@ -1518,7 +1506,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZFHMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZFHMIN: # %bb.0:
-; CHECK64-IZFHMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZFHMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZFHMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64-IZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECK64-IZFHMIN-NEXT: ret
@@ -1532,7 +1520,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZHINXMIN: # %bb.0:
-; CHECK64-IZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZHINXMIN-NEXT: ret
@@ -1546,7 +1534,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZDINXZHINXMIN: # %bb.0:
-; CHECK64-IZDINXZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZDINXZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index c53237e..facb544 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -4388,17 +4388,11 @@ define half @fcvt_h_wu(i32 %a) nounwind {
}
define half @fcvt_h_wu_load(ptr %p) nounwind {
-; RV32IZFH-LABEL: fcvt_h_wu_load:
-; RV32IZFH: # %bb.0:
-; RV32IZFH-NEXT: lw a0, 0(a0)
-; RV32IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV32IZFH-NEXT: ret
-;
-; RV64IZFH-LABEL: fcvt_h_wu_load:
-; RV64IZFH: # %bb.0:
-; RV64IZFH-NEXT: lwu a0, 0(a0)
-; RV64IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV64IZFH-NEXT: ret
+; CHECKIZFH-LABEL: fcvt_h_wu_load:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: lw a0, 0(a0)
+; CHECKIZFH-NEXT: fcvt.h.wu fa0, a0
+; CHECKIZFH-NEXT: ret
;
; RV32IDZFH-LABEL: fcvt_h_wu_load:
; RV32IDZFH: # %bb.0:
@@ -4408,33 +4402,21 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; RV64IDZFH-LABEL: fcvt_h_wu_load:
; RV64IDZFH: # %bb.0:
-; RV64IDZFH-NEXT: lwu a0, 0(a0)
+; RV64IDZFH-NEXT: lw a0, 0(a0)
; RV64IDZFH-NEXT: fcvt.h.wu fa0, a0
; RV64IDZFH-NEXT: ret
;
-; RV32IZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZHINX: # %bb.0:
-; RV32IZHINX-NEXT: lw a0, 0(a0)
-; RV32IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZHINX-NEXT: ret
-;
-; RV64IZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZHINX: # %bb.0:
-; RV64IZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZHINX-NEXT: ret
-;
-; RV32IZDINXZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZDINXZHINX: # %bb.0:
-; RV32IZDINXZHINX-NEXT: lw a0, 0(a0)
-; RV32IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZDINXZHINX-NEXT: ret
+; CHECKIZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZHINX: # %bb.0:
+; CHECKIZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZHINX-NEXT: ret
;
-; RV64IZDINXZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZDINXZHINX: # %bb.0:
-; RV64IZDINXZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZDINXZHINX-NEXT: ret
+; CHECKIZDINXZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZDINXZHINX: # %bb.0:
+; CHECKIZDINXZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZDINXZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZDINXZHINX-NEXT: ret
;
; RV32I-LABEL: fcvt_h_wu_load:
; RV32I: # %bb.0:
@@ -4476,7 +4458,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
; RV64ID-LP64: # %bb.0:
; RV64ID-LP64-NEXT: addi sp, sp, -16
; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-LP64-NEXT: lwu a0, 0(a0)
+; RV64ID-LP64-NEXT: lw a0, 0(a0)
; RV64ID-LP64-NEXT: fcvt.s.wu fa5, a0
; RV64ID-LP64-NEXT: fmv.x.w a0, fa5
; RV64ID-LP64-NEXT: call __truncsfhf2
@@ -4505,7 +4487,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
; RV64ID: # %bb.0:
; RV64ID-NEXT: addi sp, sp, -16
; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT: lwu a0, 0(a0)
+; RV64ID-NEXT: lw a0, 0(a0)
; RV64ID-NEXT: fcvt.s.wu fa0, a0
; RV64ID-NEXT: call __truncsfhf2
; RV64ID-NEXT: fmv.x.w a0, fa0
@@ -4525,7 +4507,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZFHMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZFHMIN: # %bb.0:
-; CHECK64-IZFHMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZFHMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZFHMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64-IZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECK64-IZFHMIN-NEXT: ret
@@ -4539,7 +4521,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZHINXMIN: # %bb.0:
-; CHECK64-IZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZHINXMIN-NEXT: ret
@@ -4553,7 +4535,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZDINXZHINXMIN: # %bb.0:
-; CHECK64-IZDINXZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZDINXZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 66cde32..774f1a1 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -651,7 +651,7 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-NEXT: srai a2, a0, 63
; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: xor a0, a0, a2
-; RV64I-NEXT: subw a0, a0, a2
+; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
; RV64I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/interrupt-attr.ll b/llvm/test/CodeGen/RISCV/interrupt-attr.ll
index e278b8d..472b903 100644
--- a/llvm/test/CodeGen/RISCV/interrupt-attr.ll
+++ b/llvm/test/CodeGen/RISCV/interrupt-attr.ll
@@ -794,498 +794,46 @@ define void @foo_with_call() #1 {
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub sp, sp, a0
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: mv a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 1
; CHECK-RV32-V-NEXT: add a0, a0, a1
; CHECK-RV32-V-NEXT: add a0, sp, a0
; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 4
; CHECK-RV32-V-NEXT: add a0, sp, a0
; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: add a0, sp, a0
; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: addi a0, sp, 16
-; CHECK-RV32-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: call otherfoo
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: mv a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: add a0, a0, a1
; CHECK-RV32-V-NEXT: add a0, sp, a0
; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 4
; CHECK-RV32-V-NEXT: add a0, sp, a0
; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: add a0, sp, a0
; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: addi a0, sp, 16
-; CHECK-RV32-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: add sp, sp, a0
@@ -1351,498 +899,46 @@ define void @foo_with_call() #1 {
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub sp, sp, a0
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: mv a1, a0
; CHECK-RV32-FV-NEXT: slli a0, a0, 1
; CHECK-RV32-FV-NEXT: add a0, a0, a1
; CHECK-RV32-FV-NEXT: add a0, sp, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 4
; CHECK-RV32-FV-NEXT: add a0, sp, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: add a0, sp, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: addi a0, sp, 16
-; CHECK-RV32-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: call otherfoo
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: mv a1, a0
; CHECK-RV32-FV-NEXT: slli a0, a0, 1
; CHECK-RV32-FV-NEXT: add a0, a0, a1
; CHECK-RV32-FV-NEXT: add a0, sp, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 4
; CHECK-RV32-FV-NEXT: add a0, sp, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: add a0, sp, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: add a0, sp, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: addi a0, sp, 16
-; CHECK-RV32-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: add sp, sp, a0
@@ -1928,498 +1024,46 @@ define void @foo_with_call() #1 {
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: sub sp, sp, a0
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: mv a1, a0
; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: add a0, a0, a1
; CHECK-RV32-FDV-NEXT: add a0, sp, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 4
; CHECK-RV32-FDV-NEXT: add a0, sp, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: add a0, sp, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: addi a0, sp, 16
-; CHECK-RV32-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: call otherfoo
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: mv a1, a0
; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
; CHECK-RV32-FDV-NEXT: add a0, a0, a1
; CHECK-RV32-FDV-NEXT: add a0, sp, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 4
; CHECK-RV32-FDV-NEXT: add a0, sp, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: add a0, sp, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: add a0, sp, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: addi a0, sp, 16
-; CHECK-RV32-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: add sp, sp, a0
@@ -3259,498 +1903,46 @@ define void @foo_with_call() #1 {
; CHECK-RV64-V-NEXT: slli a0, a0, 5
; CHECK-RV64-V-NEXT: sub sp, sp, a0
; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 5
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: mv a1, a0
; CHECK-RV64-V-NEXT: slli a0, a0, 1
; CHECK-RV64-V-NEXT: add a0, a0, a1
; CHECK-RV64-V-NEXT: add a0, sp, a0
; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 4
; CHECK-RV64-V-NEXT: add a0, sp, a0
; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: add a0, sp, a0
; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: addi a0, sp, 16
-; CHECK-RV64-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: call otherfoo
; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 5
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: mv a1, a0
; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: add a0, a0, a1
; CHECK-RV64-V-NEXT: add a0, sp, a0
; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 4
; CHECK-RV64-V-NEXT: add a0, sp, a0
; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: add a0, sp, a0
; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: add a0, sp, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, 16
-; CHECK-RV64-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: addi a0, sp, 16
-; CHECK-RV64-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 5
; CHECK-RV64-V-NEXT: add sp, sp, a0
@@ -3816,498 +2008,46 @@ define void @foo_with_call() #1 {
; CHECK-RV64-FV-NEXT: slli a0, a0, 5
; CHECK-RV64-FV-NEXT: sub sp, sp, a0
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: mv a1, a0
; CHECK-RV64-FV-NEXT: slli a0, a0, 1
; CHECK-RV64-FV-NEXT: add a0, a0, a1
; CHECK-RV64-FV-NEXT: add a0, sp, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 4
; CHECK-RV64-FV-NEXT: add a0, sp, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: add a0, sp, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: addi a0, sp, 16
-; CHECK-RV64-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: call otherfoo
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: mv a1, a0
; CHECK-RV64-FV-NEXT: slli a0, a0, 1
; CHECK-RV64-FV-NEXT: add a0, a0, a1
; CHECK-RV64-FV-NEXT: add a0, sp, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 4
; CHECK-RV64-FV-NEXT: add a0, sp, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: add a0, sp, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: add a0, sp, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: addi a0, sp, 16
-; CHECK-RV64-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 5
; CHECK-RV64-FV-NEXT: add sp, sp, a0
@@ -4393,498 +2133,46 @@ define void @foo_with_call() #1 {
; CHECK-RV64-FDV-NEXT: slli a0, a0, 5
; CHECK-RV64-FDV-NEXT: sub sp, sp, a0
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: mv a1, a0
; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: add a0, a0, a1
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 4
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: addi a0, sp, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: call otherfoo
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: mv a1, a0
; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
; CHECK-RV64-FDV-NEXT: add a0, a0, a1
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 4
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: addi a0, sp, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 5
; CHECK-RV64-FDV-NEXT: add sp, sp, a0
@@ -5670,422 +2958,39 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub sp, sp, a0
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 4
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 1
; CHECK-RV32-V-NEXT: add a0, a0, a1
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: call otherfoo
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 4
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: mv a1, a0
@@ -6093,81 +2998,12 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-V-NEXT: add a0, a0, a1
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: addi sp, s0, -80
; CHECK-RV32-V-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; CHECK-RV32-V-NEXT: lw t0, 72(sp) # 4-byte Folded Reload
@@ -6234,172 +3070,15 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub sp, sp, a0
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 4
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: mv a1, a0
@@ -6407,331 +3086,36 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FV-NEXT: add a0, a0, a1
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: call otherfoo
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 4
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
; CHECK-RV32-FV-NEXT: slli a0, a0, 1
; CHECK-RV32-FV-NEXT: add a0, a0, a1
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: addi sp, s0, -160
; CHECK-RV32-FV-NEXT: lw ra, 156(sp) # 4-byte Folded Reload
; CHECK-RV32-FV-NEXT: lw t0, 152(sp) # 4-byte Folded Reload
@@ -6818,172 +3202,15 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: sub sp, sp, a0
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 4
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: mv a1, a0
@@ -6991,249 +3218,23 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FDV-NEXT: add a0, a0, a1
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: call otherfoo
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 4
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: mv a1, a0
@@ -7241,81 +3242,12 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FDV-NEXT: add a0, a0, a1
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FDV-NEXT: addi sp, s0, -240
; CHECK-RV32-FDV-NEXT: lw ra, 236(sp) # 4-byte Folded Reload
; CHECK-RV32-FDV-NEXT: lw t0, 232(sp) # 4-byte Folded Reload
@@ -8186,422 +4118,39 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-V-NEXT: slli a0, a0, 5
; CHECK-RV64-V-NEXT: sub sp, sp, a0
; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 4
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
; CHECK-RV64-V-NEXT: slli a0, a0, 1
; CHECK-RV64-V-NEXT: add a0, a0, a1
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 5
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 5
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-V-NEXT: call otherfoo
; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 4
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 4
-; CHECK-RV64-V-NEXT: add a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 3
; CHECK-RV64-V-NEXT: mv a1, a0
@@ -8609,81 +4158,12 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-V-NEXT: add a0, a0, a1
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 3
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 2
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: mv a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a1, a1, a0
-; CHECK-RV64-V-NEXT: slli a0, a0, 1
-; CHECK-RV64-V-NEXT: add a0, a0, a1
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-V-NEXT: csrr a0, vlenb
-; CHECK-RV64-V-NEXT: slli a1, a0, 5
-; CHECK-RV64-V-NEXT: sub a0, a1, a0
-; CHECK-RV64-V-NEXT: sub a0, s0, a0
-; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: csrr a0, vlenb
; CHECK-RV64-V-NEXT: slli a0, a0, 5
; CHECK-RV64-V-NEXT: sub a0, s0, a0
; CHECK-RV64-V-NEXT: addi a0, a0, -160
-; CHECK-RV64-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-V-NEXT: addi sp, s0, -160
; CHECK-RV64-V-NEXT: ld ra, 152(sp) # 8-byte Folded Reload
; CHECK-RV64-V-NEXT: ld t0, 144(sp) # 8-byte Folded Reload
@@ -8750,172 +4230,15 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-FV-NEXT: slli a0, a0, 5
; CHECK-RV64-FV-NEXT: sub sp, sp, a0
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 4
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: mv a1, a0
@@ -8923,331 +4246,36 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-FV-NEXT: add a0, a0, a1
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 5
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FV-NEXT: call otherfoo
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 4
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FV-NEXT: add a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
; CHECK-RV64-FV-NEXT: slli a0, a0, 1
; CHECK-RV64-FV-NEXT: add a0, a0, a1
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: mv a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a1, a1, a0
-; CHECK-RV64-FV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FV-NEXT: add a0, a0, a1
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: csrr a0, vlenb
; CHECK-RV64-FV-NEXT: slli a0, a0, 5
; CHECK-RV64-FV-NEXT: sub a0, s0, a0
; CHECK-RV64-FV-NEXT: addi a0, a0, -240
-; CHECK-RV64-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FV-NEXT: addi sp, s0, -240
; CHECK-RV64-FV-NEXT: ld ra, 232(sp) # 8-byte Folded Reload
; CHECK-RV64-FV-NEXT: ld t0, 224(sp) # 8-byte Folded Reload
@@ -9334,172 +4362,15 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-FDV-NEXT: slli a0, a0, 5
; CHECK-RV64-FDV-NEXT: sub sp, sp, a0
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 4
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: mv a1, a0
@@ -9507,249 +4378,23 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-FDV-NEXT: add a0, a0, a1
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 5
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: call otherfoo
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 4
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: mv a1, a0
@@ -9757,81 +4402,12 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV64-FDV-NEXT: add a0, a0, a1
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 5
; CHECK-RV64-FDV-NEXT: sub a0, s0, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, -320
-; CHECK-RV64-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: addi sp, s0, -320
; CHECK-RV64-FDV-NEXT: ld ra, 312(sp) # 8-byte Folded Reload
; CHECK-RV64-FDV-NEXT: ld t0, 304(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index b1a6d16..a06c750 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -7,18 +7,18 @@
define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-LABEL: ctz_nxv4i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT: vid.v v10
-; RV32-NEXT: vmv.v.i v11, -1
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: li a1, -1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: srli a0, a0, 1
; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vmacc.vv v8, v10, v11
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT: vmadd.vx v10, a1, v8
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vvm v8, v8, v10, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
@@ -28,21 +28,21 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
;
; RV64-LABEL: ctz_nxv4i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: vmv.v.i v11, -1
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT: vid.v v10
+; RV64-NEXT: li a1, -1
; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmacc.vv v8, v10, v11
-; RV64-NEXT: vmv.v.i v9, 0
-; RV64-NEXT: vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT: vmadd.vx v10, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v10, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
@@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
;
; RV64-LABEL: ctz_nxv8i1_no_range:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vmv.v.i v24, -1
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: li a1, -1
; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmacc.vv v8, v16, v24
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT: vmadd.vx v16, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
index 20dd590..1216d30 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -35,7 +35,7 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: li a1, 4
-; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: zext.b a0, a1
; RV64-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0)
diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index 1be599e4..7a1c41c 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -454,7 +454,7 @@ define i32 @test_reassoc_add_sub_i32_1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_reassoc_add_sub_i32_1:
; CHECK: # %bb.0:
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: subw a2, a2, a3
+; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: subw a0, a0, a2
; CHECK-NEXT: ret
%t0 = add i32 %a0, %a1
@@ -467,7 +467,7 @@ define i32 @test_reassoc_add_sub_i32_2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_reassoc_add_sub_i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: subw a2, a2, a3
+; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: addw a0, a0, a2
; CHECK-NEXT: ret
%t0 = add i32 %a0, %a1
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 0d57e42..cd93579 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -3780,9 +3780,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
;
; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
@@ -3985,9 +3985,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
;
; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index 0caab1f..a5bdb13 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -4410,9 +4410,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
;
; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
@@ -4615,9 +4615,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
;
; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1)
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0
; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 27d5eaa..4c9a98c 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1080,14 +1080,14 @@ define i32 @muli32_m65(i32 %a) nounwind {
; RV64I-LABEL: muli32_m65:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a1, a0, 6
-; RV64I-NEXT: negw a0, a0
+; RV64I-NEXT: neg a0, a0
; RV64I-NEXT: subw a0, a0, a1
; RV64I-NEXT: ret
;
; RV64IM-LABEL: muli32_m65:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 6
-; RV64IM-NEXT: negw a0, a0
+; RV64IM-NEXT: neg a0, a0
; RV64IM-NEXT: subw a0, a0, a1
; RV64IM-NEXT: ret
%1 = mul i32 %a, -65
@@ -1980,14 +1980,14 @@ define i8 @muladd_demand(i8 %x, i8 %y) nounwind {
; RV64I-LABEL: muladd_demand:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a0, a0, 1
-; RV64I-NEXT: subw a0, a1, a0
+; RV64I-NEXT: sub a0, a1, a0
; RV64I-NEXT: andi a0, a0, 15
; RV64I-NEXT: ret
;
; RV64IM-LABEL: muladd_demand:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a0, a0, 1
-; RV64IM-NEXT: subw a0, a1, a0
+; RV64IM-NEXT: sub a0, a1, a0
; RV64IM-NEXT: andi a0, a0, 15
; RV64IM-NEXT: ret
%m = mul i8 %x, 14
@@ -2048,14 +2048,14 @@ define i8 @muladd_demand_2(i8 %x, i8 %y) nounwind {
; RV64I-LABEL: muladd_demand_2:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a0, a0, 1
-; RV64I-NEXT: subw a1, a1, a0
+; RV64I-NEXT: sub a1, a1, a0
; RV64I-NEXT: ori a0, a1, -16
; RV64I-NEXT: ret
;
; RV64IM-LABEL: muladd_demand_2:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a0, a0, 1
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: ori a0, a1, -16
; RV64IM-NEXT: ret
%m = mul i8 %x, 14
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index fe19a4fa..da81fe5 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -179,7 +179,7 @@ define i32 @neg_abs32_multiuse(i32 %x, ptr %y) {
; RV64I: # %bb.0:
; RV64I-NEXT: sraiw a2, a0, 31
; RV64I-NEXT: xor a0, a0, a2
-; RV64I-NEXT: subw a2, a0, a2
+; RV64I-NEXT: sub a2, a0, a2
; RV64I-NEXT: negw a0, a2
; RV64I-NEXT: sw a2, 0(a1)
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 47b90a0..ba6769b 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -833,7 +833,7 @@ define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) {
; RV64-NEXT: sext.w a3, a1
; RV64-NEXT: sext.w a4, a0
; RV64-NEXT: sltu a3, a4, a3
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: sw a0, 0(a2)
; RV64-NEXT: mv a0, a3
; RV64-NEXT: ret
@@ -860,7 +860,7 @@ define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) {
; RV64: # %bb.0:
; RV64-NEXT: zext.b a2, a0
; RV64-NEXT: li a3, 42
-; RV64-NEXT: subw a3, a3, a0
+; RV64-NEXT: sub a3, a3, a0
; RV64-NEXT: sltiu a0, a2, 43
; RV64-NEXT: xori a0, a0, 1
; RV64-NEXT: sb a3, 0(a1)
@@ -890,7 +890,7 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) {
; RV64-NEXT: slli a2, a0, 48
; RV64-NEXT: li a3, 43
; RV64-NEXT: srli a2, a2, 48
-; RV64-NEXT: subw a3, a3, a0
+; RV64-NEXT: sub a3, a3, a0
; RV64-NEXT: sltiu a0, a2, 44
; RV64-NEXT: xori a0, a0, 1
; RV64-NEXT: sh a3, 0(a1)
@@ -987,7 +987,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, ptr %p) {
; RV64-LABEL: usubo_ne_constant0_op1_i32:
; RV64: # %bb.0:
; RV64-NEXT: sext.w a2, a0
-; RV64-NEXT: negw a3, a0
+; RV64-NEXT: neg a3, a0
; RV64-NEXT: snez a0, a2
; RV64-NEXT: sw a3, 0(a1)
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/pr145360.ll b/llvm/test/CodeGen/RISCV/pr145360.ll
index 4251ac6..1c77fad 100644
--- a/llvm/test/CodeGen/RISCV/pr145360.ll
+++ b/llvm/test/CodeGen/RISCV/pr145360.ll
@@ -8,7 +8,7 @@ define i32 @signed(i32 %0, ptr %1) {
; CHECK-NEXT: srliw a2, a2, 24
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: andi a2, a2, -256
-; CHECK-NEXT: subw a2, a0, a2
+; CHECK-NEXT: sub a2, a0, a2
; CHECK-NEXT: sraiw a0, a0, 8
; CHECK-NEXT: sw a2, 0(a1)
; CHECK-NEXT: ret
@@ -29,7 +29,7 @@ define i32 @unsigned(i32 %0, ptr %1) {
; CHECK-NEXT: srli a2, a2, 36
; CHECK-NEXT: slli a4, a2, 5
; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
; CHECK-NEXT: srliw a4, a0, 3
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: mulw a0, a4, a3
@@ -49,7 +49,7 @@ define i32 @signed_div_first(i32 %0, ptr %1) {
; CHECK-NEXT: add a3, a0, a2
; CHECK-NEXT: sraiw a2, a3, 8
; CHECK-NEXT: andi a3, a3, -256
-; CHECK-NEXT: subw a0, a0, a3
+; CHECK-NEXT: sub a0, a0, a3
; CHECK-NEXT: sw a0, 0(a1)
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: ret
@@ -70,7 +70,7 @@ define i32 @unsigned_div_first(i32 %0, ptr %1) {
; CHECK-NEXT: srli a2, a2, 36
; CHECK-NEXT: slli a3, a2, 5
; CHECK-NEXT: slli a4, a2, 3
-; CHECK-NEXT: subw a4, a4, a3
+; CHECK-NEXT: sub a4, a4, a3
; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: sw a0, 0(a1)
; CHECK-NEXT: mv a0, a2
diff --git a/llvm/test/CodeGen/RISCV/pr148084.ll b/llvm/test/CodeGen/RISCV/pr148084.ll
new file mode 100644
index 0000000..9fa26c7
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr148084.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+source_filename = "external/libaom/av1/encoder/tx_search.c"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-android10000"
+
+define fastcc void @search_tx_type() #0 {
+; CHECK-LABEL: search_tx_type:
+; CHECK: # %bb.0: # %._crit_edge.i
+; CHECK-NEXT: # %bb.1: # %bb
+; CHECK-NEXT: lbu a1, 0(zero)
+; CHECK-NEXT: lw a0, 0(zero)
+; CHECK-NEXT: lh a2, 0(zero)
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: srai a3, a0, 63
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: andi a2, a1, 1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: or a3, a3, a0
+; CHECK-NEXT: or a2, a2, a3
+; CHECK-NEXT: bgez a2, .LBB0_3
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: bexti a3, a1, 1
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: .LBB0_3: # %bb
+; CHECK-NEXT: andi a4, a1, 4
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a4, .LBB0_5
+; CHECK-NEXT: # %bb.4: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_5: # %bb
+; CHECK-NEXT: blt a2, a0, .LBB0_7
+; CHECK-NEXT: # %bb.6: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_7: # %bb
+; CHECK-NEXT: andi a5, a1, 8
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_9
+; CHECK-NEXT: # %bb.8: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_9: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_11
+; CHECK-NEXT: # %bb.10: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_11: # %bb
+; CHECK-NEXT: andi a5, a1, 16
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_13
+; CHECK-NEXT: # %bb.12: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_13: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_15
+; CHECK-NEXT: # %bb.14: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_15: # %bb
+; CHECK-NEXT: andi a5, a1, 32
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_17
+; CHECK-NEXT: # %bb.16: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_17: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_19
+; CHECK-NEXT: # %bb.18: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_19: # %bb
+; CHECK-NEXT: andi a5, a1, 64
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_21
+; CHECK-NEXT: # %bb.20: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_21: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_23
+; CHECK-NEXT: # %bb.22: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_23: # %bb
+; CHECK-NEXT: andi a5, a1, 128
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_25
+; CHECK-NEXT: # %bb.24: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_25: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_27
+; CHECK-NEXT: # %bb.26: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_27: # %bb
+; CHECK-NEXT: andi a5, a1, 256
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_29
+; CHECK-NEXT: # %bb.28: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_29: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_31
+; CHECK-NEXT: # %bb.30: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_31: # %bb
+; CHECK-NEXT: andi a5, a1, 512
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_33
+; CHECK-NEXT: # %bb.32: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_33: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_35
+; CHECK-NEXT: # %bb.34: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_35: # %bb
+; CHECK-NEXT: andi a5, a1, 1024
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_37
+; CHECK-NEXT: # %bb.36: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_37: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_39
+; CHECK-NEXT: # %bb.38: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_39: # %bb
+; CHECK-NEXT: slli a5, a1, 52
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: bgez a5, .LBB0_41
+; CHECK-NEXT: # %bb.40: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_41: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_43
+; CHECK-NEXT: # %bb.42: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_43: # %bb
+; CHECK-NEXT: slli a4, a1, 51
+; CHECK-NEXT: sext.w a3, a2
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: bltz a4, .LBB0_49
+; CHECK-NEXT: # %bb.44: # %bb
+; CHECK-NEXT: bge a3, a0, .LBB0_50
+; CHECK-NEXT: .LBB0_45: # %bb
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: blt a2, a0, .LBB0_47
+; CHECK-NEXT: .LBB0_46: # %bb
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB0_47: # %bb
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: # %bb.48: # %get_tx_mask.exit
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_49: # %bb
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: blt a3, a0, .LBB0_45
+; CHECK-NEXT: .LBB0_50: # %bb
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: sext.w a2, a2
+; CHECK-NEXT: bge a2, a0, .LBB0_46
+; CHECK-NEXT: j .LBB0_47
+._crit_edge.i:
+ %.in196.i = load i16, ptr null, align 2
+ %i2 = load i16, ptr null, align 2
+ %i3 = and i16 %i2, %.in196.i
+ %i9 = trunc nuw i8 0 to i1
+ br i1 %i9, label %get_tx_mask.exit, label %bb
+
+bb: ; preds = %._crit_edge.i
+ %i13 = load i8, ptr null, align 1
+ %i14 = icmp eq i8 %i13, 0
+ %spec.select211.i = select i1 %i14, i16 0, i16 %i3
+ %i19 = load i32, ptr null, align 4
+ %i20 = zext i16 %spec.select211.i to i32
+ %i21 = load i32, ptr null, align 4
+ %i22 = icmp sgt i32 %i21, -1
+ %i23 = and i32 %i20, 1
+ %.not203.i = icmp eq i32 %i23, 0
+ %spec.select212.i = select i1 %.not203.i, i32 -1, i32 %i21
+ %.1174.i = select i1 %i22, i32 %spec.select212.i, i32 -1
+ %i28 = icmp sgt i32 0, %.1174.i
+ %i29 = and i32 %i20, 2
+ %.not203.1.not.i = icmp eq i32 %i29, 0
+ %spec.select212.1.i = select i1 %.not203.1.not.i, i32 %.1174.i, i32 0
+ %.1174.1.i = select i1 %i28, i32 %spec.select212.1.i, i32 %.1174.i
+ %i30 = load i32, ptr null, align 4
+ %i31 = icmp sgt i32 %i30, %.1174.1.i
+ %i32 = and i32 %i20, 4
+ %.not203.2.i = icmp eq i32 %i32, 0
+ %spec.select212.2.i = select i1 %.not203.2.i, i32 %.1174.1.i, i32 %i30
+ %.1174.2.i = select i1 %i31, i32 %spec.select212.2.i, i32 %.1174.1.i
+ %i36 = load i32, ptr null, align 4
+ %i37 = icmp sgt i32 %i36, %.1174.2.i
+ %i38 = and i32 %i20, 8
+ %.not203.3.i = icmp eq i32 %i38, 0
+ %spec.select212.3.i = select i1 %.not203.3.i, i32 %.1174.2.i, i32 %i36
+ %.1174.3.i = select i1 %i37, i32 %spec.select212.3.i, i32 %.1174.2.i
+ %i42 = load i32, ptr null, align 4
+ %i43 = icmp sgt i32 %i42, %.1174.3.i
+ %i44 = and i32 %i20, 16
+ %.not203.4.i = icmp eq i32 %i44, 0
+ %spec.select212.4.i = select i1 %.not203.4.i, i32 %.1174.3.i, i32 %i42
+ %.1174.4.i = select i1 %i43, i32 %spec.select212.4.i, i32 %.1174.3.i
+ %i48 = load i32, ptr null, align 4
+ %i49 = icmp sgt i32 %i48, %.1174.4.i
+ %i50 = and i32 %i20, 32
+ %.not203.5.i = icmp eq i32 %i50, 0
+ %spec.select212.5.i = select i1 %.not203.5.i, i32 %.1174.4.i, i32 %i48
+ %.1174.5.i = select i1 %i49, i32 %spec.select212.5.i, i32 %.1174.4.i
+ %i51 = load i32, ptr null, align 4
+ %i52 = icmp sgt i32 %i51, %.1174.5.i
+ %i53 = and i32 %i20, 64
+ %.not203.6.i = icmp eq i32 %i53, 0
+ %spec.select212.6.i = select i1 %.not203.6.i, i32 %.1174.5.i, i32 %i51
+ %.1174.6.i = select i1 %i52, i32 %spec.select212.6.i, i32 %.1174.5.i
+ %i56 = load i32, ptr null, align 4
+ %i57 = icmp sgt i32 %i56, %.1174.6.i
+ %i58 = and i32 %i20, 128
+ %.not203.7.i = icmp eq i32 %i58, 0
+ %spec.select212.7.i = select i1 %.not203.7.i, i32 %.1174.6.i, i32 %i56
+ %.1174.7.i = select i1 %i57, i32 %spec.select212.7.i, i32 %.1174.6.i
+ %i60 = load i32, ptr null, align 4
+ %i61 = icmp sgt i32 %i60, %.1174.7.i
+ %i62 = and i32 %i20, 256
+ %.not203.8.i = icmp eq i32 %i62, 0
+ %spec.select212.8.i = select i1 %.not203.8.i, i32 %.1174.7.i, i32 %i60
+ %.1174.8.i = select i1 %i61, i32 %spec.select212.8.i, i32 %.1174.7.i
+ %i63 = load i32, ptr null, align 4
+ %i64 = icmp sgt i32 %i63, %.1174.8.i
+ %i65 = and i32 %i20, 512
+ %.not203.9.i = icmp eq i32 %i65, 0
+ %spec.select212.9.i = select i1 %.not203.9.i, i32 %.1174.8.i, i32 %i63
+ %.1174.9.i = select i1 %i64, i32 %spec.select212.9.i, i32 %.1174.8.i
+ %i67 = load i32, ptr null, align 4
+ %i68 = icmp sgt i32 %i67, %.1174.9.i
+ %i69 = and i32 %i20, 1024
+ %.not203.10.i = icmp eq i32 %i69, 0
+ %spec.select212.10.i = select i1 %.not203.10.i, i32 %.1174.9.i, i32 %i67
+ %.1174.10.i = select i1 %i68, i32 %spec.select212.10.i, i32 %.1174.9.i
+ %i70 = load i32, ptr null, align 4
+ %i71 = icmp sgt i32 %i70, %.1174.10.i
+ %i72 = and i32 %i20, 2048
+ %.not203.11.i = icmp eq i32 %i72, 0
+ %spec.select212.11.i = select i1 %.not203.11.i, i32 %.1174.10.i, i32 %i70
+ %.1174.11.i = select i1 %i71, i32 %spec.select212.11.i, i32 %.1174.10.i
+ %i75 = load i32, ptr null, align 4
+ %i76 = icmp sgt i32 %i75, %.1174.11.i
+ %i77 = and i32 %i20, 4096
+ %.not203.12.i = icmp eq i32 %i77, 0
+ %spec.select212.12.i = select i1 %.not203.12.i, i32 %.1174.11.i, i32 %i75
+ %.1174.12.i = select i1 %i76, i32 %spec.select212.12.i, i32 %.1174.11.i
+ %i80 = load i32, ptr null, align 4
+ %i81 = icmp sgt i32 %i80, %.1174.12.i
+ %spec.select212.13.i = select i1 false, i32 %.1174.12.i, i32 %i80
+ %.1174.13.i = select i1 %i81, i32 %spec.select212.13.i, i32 %.1174.12.i
+ %.1172.13.i = select i1 %i81, i32 13, i32 0
+ %i84 = icmp sgt i32 0, %.1174.13.i
+ %.1172.14.i = select i1 %i84, i32 14, i32 %.1172.13.i
+ %i88 = icmp slt i32 0, %i19
+ %i89 = select i1 %i88, i16 -32768, i16 0
+ %i90 = zext i16 %i89 to i32
+ %i91 = shl nuw nsw i32 1, %.1172.14.i
+ %i92 = and i32 %i91, %i90
+ %.not200.i = icmp eq i32 %i92, 0
+ %i93 = trunc nuw i32 %i91 to i16
+ %i94 = xor i16 %i93, -1
+ %i95 = select i1 %.not200.i, i16 -1, i16 %i94
+ %.2177.i = and i16 %i95, %i89
+ %i96 = xor i16 %.2177.i, -1
+ %i97 = and i16 %spec.select211.i, %i96
+ br label %get_tx_mask.exit
+
+get_tx_mask.exit: ; preds = %._crit_edge.i, %bb
+ %.1261.i = phi i16 [ %i97, %bb ], [ 0, %._crit_edge.i ]
+ %i99 = icmp eq i16 %.1261.i, 0
+ %.2262.i = select i1 %i99, i16 0, i16 %.1261.i
+ ret void
+}
+
+attributes #0 = { noimplicitfloat nounwind sspstrong uwtable vscale_range(2,1024) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+unaligned-scalar-mem,+unaligned-vector-mem,+v,+zaamo,+zalrsc,+zba,+zbb,+zbs,+zca,+zcd,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-supm,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zabha,-zacas,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zcb,-zce,-zcf,-zclsd,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccamoc,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zihintntl,-zihintpause,-zihpm,-zilsd,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
index e05e27a..b8ff783 100644
--- a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
+++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
@@ -239,8 +239,8 @@ body: |
; NO-PREFER-W-INST-NEXT: {{ $}}
; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
- ; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0
- ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1
+ ; NO-PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1
; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
; NO-PREFER-W-INST-NEXT: PseudoRET
;
diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index 634cca5..cf64650 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -56,7 +56,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -105,7 +105,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -159,7 +159,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -253,7 +253,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -307,7 +307,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -401,7 +401,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -423,7 +423,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sllw a1, a0, a1
; RV64I-NEXT: srlw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -450,7 +450,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32_mask:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -474,7 +474,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotl_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -500,7 +500,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -545,7 +545,7 @@ define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srlw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -569,7 +569,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_32_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srlw a1, a0, a1
; RV64I-NEXT: sllw a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -596,7 +596,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32_mask:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -620,7 +620,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64I-LABEL: rotr_32_mask_and_63_and_31:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -646,7 +646,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -691,7 +691,7 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sllw a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -745,7 +745,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotl_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -835,7 +835,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -890,7 +890,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotl_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -981,7 +981,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1026,7 +1026,7 @@ define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1080,7 +1080,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: rotr_64_mask:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -1170,7 +1170,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1225,7 +1225,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64I-LABEL: rotr_64_mask_and_127_and_63:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -1316,7 +1316,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1361,7 +1361,7 @@ define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -1390,7 +1390,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I-LABEL: rotl_32_mask_shared:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srlw a0, a0, a4
; RV64I-NEXT: or a0, a3, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -1424,7 +1424,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB-LABEL: rotl_32_mask_shared:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: srlw a0, a0, a4
; RV64XTHEADBB-NEXT: or a0, a3, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1486,7 +1486,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I-LABEL: rotl_64_mask_shared:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srl a0, a0, a4
; RV64I-NEXT: or a0, a3, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -1590,7 +1590,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB-LABEL: rotl_64_mask_shared:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: srl a0, a0, a4
; RV64XTHEADBB-NEXT: or a0, a3, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -1618,7 +1618,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64I-LABEL: rotr_32_mask_shared:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: sllw a0, a0, a4
; RV64I-NEXT: or a0, a3, a0
; RV64I-NEXT: sllw a1, a1, a2
@@ -1652,7 +1652,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign
; RV64XTHEADBB-LABEL: rotr_32_mask_shared:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: sllw a0, a0, a4
; RV64XTHEADBB-NEXT: or a0, a3, a0
; RV64XTHEADBB-NEXT: sllw a1, a1, a2
@@ -1713,7 +1713,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64I-LABEL: rotr_64_mask_shared:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: or a0, a3, a0
; RV64I-NEXT: sll a1, a1, a2
@@ -1816,7 +1816,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
; RV64XTHEADBB-LABEL: rotr_64_mask_shared:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: sll a0, a0, a4
; RV64XTHEADBB-NEXT: or a0, a3, a0
; RV64XTHEADBB-NEXT: sll a1, a1, a2
@@ -1846,7 +1846,7 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-LABEL: rotl_32_mask_multiple:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: sllw a2, a1, a2
; RV64I-NEXT: srlw a0, a0, a4
; RV64I-NEXT: srlw a1, a1, a4
@@ -1884,7 +1884,7 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-LABEL: rotl_32_mask_multiple:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sllw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: sllw a2, a1, a2
; RV64XTHEADBB-NEXT: srlw a0, a0, a4
; RV64XTHEADBB-NEXT: srlw a1, a1, a4
@@ -1948,7 +1948,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-LABEL: rotl_64_mask_multiple:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: sll a2, a1, a2
; RV64I-NEXT: srl a0, a0, a4
; RV64I-NEXT: srl a1, a1, a4
@@ -2056,7 +2056,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_mask_multiple:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: sll a2, a1, a2
; RV64XTHEADBB-NEXT: srl a0, a0, a4
; RV64XTHEADBB-NEXT: srl a1, a1, a4
@@ -2087,7 +2087,7 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64I-LABEL: rotr_32_mask_multiple:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srlw a2, a1, a2
; RV64I-NEXT: sllw a0, a0, a4
; RV64I-NEXT: sllw a1, a1, a4
@@ -2125,7 +2125,7 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
; RV64XTHEADBB-LABEL: rotr_32_mask_multiple:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srlw a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: srlw a2, a1, a2
; RV64XTHEADBB-NEXT: sllw a0, a0, a4
; RV64XTHEADBB-NEXT: sllw a1, a1, a4
@@ -2188,7 +2188,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64I-LABEL: rotr_64_mask_multiple:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srl a2, a1, a2
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: sll a1, a1, a4
@@ -2295,7 +2295,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_mask_multiple:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a3, a0, a2
-; RV64XTHEADBB-NEXT: negw a4, a2
+; RV64XTHEADBB-NEXT: neg a4, a2
; RV64XTHEADBB-NEXT: srl a2, a1, a2
; RV64XTHEADBB-NEXT: sll a0, a0, a4
; RV64XTHEADBB-NEXT: sll a1, a1, a4
@@ -2353,7 +2353,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotl_64_zext:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: sll a1, a0, a1
; RV64I-NEXT: srl a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -2447,7 +2447,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotl_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: sll a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: srl a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
@@ -2503,7 +2503,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
;
; RV64I-LABEL: rotr_64_zext:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -2597,7 +2597,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
; RV64XTHEADBB-LABEL: rotr_64_zext:
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srl a2, a0, a1
-; RV64XTHEADBB-NEXT: negw a1, a1
+; RV64XTHEADBB-NEXT: neg a1, a1
; RV64XTHEADBB-NEXT: sll a0, a0, a1
; RV64XTHEADBB-NEXT: or a0, a2, a0
; RV64XTHEADBB-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
index b8c4328..721436d 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
@@ -121,7 +121,7 @@ define signext i32 @andi_sub_cse(i32 signext %0, i32 signext %1, ptr %2) {
define signext i32 @addi_sub_cse(i32 signext %0, i32 signext %1, ptr %2) {
; CHECK-LABEL: addi_sub_cse:
; CHECK: # %bb.0:
-; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: sub a0, a0, a1
; CHECK-NEXT: addiw a0, a0, -8
; CHECK-NEXT: sw a0, 0(a2)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll b/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
index dad20b2..6b4c253 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll
@@ -501,14 +501,14 @@ define signext i32 @sext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind
define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind {
; RV64I-LABEL: zext_subw_aext_aext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_aext_aext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -518,14 +518,14 @@ define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind {
define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
; RV64I-LABEL: zext_subw_aext_sext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_aext_sext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -535,14 +535,14 @@ define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind {
define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
; RV64I-LABEL: zext_subw_aext_zext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_aext_zext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -552,14 +552,14 @@ define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind {
define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
; RV64I-LABEL: zext_subw_sext_aext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_sext_aext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -569,14 +569,14 @@ define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind {
define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: zext_subw_sext_sext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_sext_sext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -586,14 +586,14 @@ define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind
define zeroext i32 @zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind {
; RV64I-LABEL: zext_subw_sext_zext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_sext_zext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -603,14 +603,14 @@ define zeroext i32 @zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind
define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
; RV64I-LABEL: zext_subw_zext_aext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_zext_aext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -620,14 +620,14 @@ define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind {
define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind {
; RV64I-LABEL: zext_subw_zext_sext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_zext_sext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
@@ -637,14 +637,14 @@ define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind
define zeroext i32 @zext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind {
; RV64I-LABEL: zext_subw_zext_zext:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: zext_subw_zext_zext:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: zext.w a0, a0
; RV64ZBA-NEXT: ret
%1 = sub i32 %a, %b
diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
index 0782018..219a5aa 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
@@ -9,7 +9,7 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
; CHECK-NEXT: not a2, a0
; CHECK-NEXT: addi a3, a0, 1
; CHECK-NEXT: add a2, a2, a1
-; CHECK-NEXT: subw a1, a1, a0
+; CHECK-NEXT: sub a1, a1, a0
; CHECK-NEXT: addi a1, a1, -2
; CHECK-NEXT: mul a3, a2, a3
; CHECK-NEXT: slli a1, a1, 32
@@ -53,7 +53,7 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
; CHECK-NEXT: bge a0, a1, .LBB1_2
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: not a2, a0
-; CHECK-NEXT: subw a3, a1, a0
+; CHECK-NEXT: sub a3, a1, a0
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: addi a3, a3, -2
; CHECK-NEXT: mul a2, a1, a2
@@ -61,7 +61,7 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
; CHECK-NEXT: slli a1, a1, 32
; CHECK-NEXT: mulhu a1, a1, a3
; CHECK-NEXT: srli a1, a1, 1
-; CHECK-NEXT: subw a0, a2, a0
+; CHECK-NEXT: sub a0, a2, a0
; CHECK-NEXT: subw a0, a0, a1
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_2:
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 00f7b46..81acb4f7 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -357,7 +357,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: beqz a0, .LBB6_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -365,16 +365,16 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -410,7 +410,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-LABEL: cttz_zero_undef_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -418,16 +418,16 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -455,7 +455,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-LABEL: findFirstSet_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -463,16 +463,16 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -508,7 +508,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-LABEL: ffs_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -516,16 +516,16 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: lui a4, %hi(.LCPI9_0)
; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0)
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index fdff4a3..b46f7cc 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -3707,7 +3707,7 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) {
define i64 @regression(i32 signext %x, i32 signext %y) {
; RV64I-LABEL: regression:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a1, a0, 29
; RV64I-NEXT: srli a0, a0, 27
@@ -3716,14 +3716,14 @@ define i64 @regression(i32 signext %x, i32 signext %y) {
;
; RV64ZBA-LABEL: regression:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: subw a0, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
; RV64ZBA-NEXT: slli.uw a0, a0, 3
; RV64ZBA-NEXT: sh1add a0, a0, a0
; RV64ZBA-NEXT: ret
;
; RV64XANDESPERF-LABEL: regression:
; RV64XANDESPERF: # %bb.0:
-; RV64XANDESPERF-NEXT: subw a0, a0, a1
+; RV64XANDESPERF-NEXT: sub a0, a0, a1
; RV64XANDESPERF-NEXT: slli a0, a0, 32
; RV64XANDESPERF-NEXT: srli a0, a0, 29
; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
index 12fc98c..f2c95f8 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
@@ -225,7 +225,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: rol_i32:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -243,7 +243,7 @@ define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: rol_i32_nosext:
; RV64I: # %bb.0:
; RV64I-NEXT: sllw a3, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srlw a0, a0, a1
; RV64I-NEXT: or a0, a3, a0
; RV64I-NEXT: sw a0, 0(a2)
@@ -263,7 +263,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: rol_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: negw a2, a0
+; RV64I-NEXT: neg a2, a0
; RV64I-NEXT: sllw a0, a1, a0
; RV64I-NEXT: srlw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -284,7 +284,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: rol_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: sll a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: srl a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -303,7 +303,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: ror_i32:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
@@ -321,7 +321,7 @@ define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
; RV64I-LABEL: ror_i32_nosext:
; RV64I: # %bb.0:
; RV64I-NEXT: srlw a3, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: or a0, a3, a0
; RV64I-NEXT: sw a0, 0(a2)
@@ -341,7 +341,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
; RV64I-LABEL: ror_i32_neg_constant_rhs:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, -2
-; RV64I-NEXT: negw a2, a0
+; RV64I-NEXT: neg a2, a0
; RV64I-NEXT: srlw a0, a1, a0
; RV64I-NEXT: sllw a1, a1, a2
; RV64I-NEXT: or a0, a0, a1
@@ -362,7 +362,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: ror_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index e640727..d133f9d 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -347,7 +347,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I: # %bb.0:
; RV64I-NEXT: beqz a0, .LBB6_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -355,16 +355,16 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -390,7 +390,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-LABEL: cttz_zero_undef_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -398,16 +398,16 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -430,7 +430,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-LABEL: findFirstSet_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -438,16 +438,16 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -478,7 +478,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-LABEL: ffs_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -486,16 +486,16 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: lui a4, %hi(.LCPI9_0)
; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0)
@@ -701,7 +701,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
;
; RV64ZBB-LABEL: ctpop_i32_load:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: lwu a0, 0(a0)
+; RV64ZBB-NEXT: lw a0, 0(a0)
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: ret
%a = load i32, ptr %p
@@ -1741,7 +1741,7 @@ define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
; RV64ZBB-LABEL: sub_if_uge_i8:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: zext.b a2, a0
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: zext.b a0, a0
; RV64ZBB-NEXT: minu a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1767,7 +1767,7 @@ define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
; RV64ZBB-LABEL: sub_if_uge_i16:
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: zext.h a2, a0
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: zext.h a0, a0
; RV64ZBB-NEXT: minu a0, a2, a0
; RV64ZBB-NEXT: ret
@@ -1852,7 +1852,7 @@ define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
; CHECK-NEXT: sltu a2, a3, a2
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a1, a2, a1
-; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: sub a0, a0, a1
; CHECK-NEXT: sllw a0, a0, a1
; CHECK-NEXT: ret
%cmp = icmp ult i32 %x, %y
@@ -1870,7 +1870,7 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
; RV64I-NEXT: sltu a4, a3, a2
; RV64I-NEXT: addi a4, a4, -1
; RV64I-NEXT: and a1, a4, a1
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: bltu a3, a2, .LBB68_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 4
@@ -1980,7 +1980,7 @@ define i32 @sub_if_uge_C_i32(i32 signext %x) {
; RV64I-NEXT: lui a2, 1048560
; RV64I-NEXT: addi a1, a1, -16
; RV64I-NEXT: sltu a1, a1, a0
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: addi a2, a2, 15
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: addw a0, a0, a1
@@ -2036,7 +2036,7 @@ define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
; RV64I-NEXT: lui a3, 1048560
; RV64I-NEXT: addi a2, a2, -16
; RV64I-NEXT: sltu a2, a2, a0
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: addi a3, a3, 15
; RV64I-NEXT: and a3, a4, a3
; RV64I-NEXT: addw a0, a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 696c2a5..818ea72 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -114,7 +114,7 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
; RV64ZBKB-LABEL: pack_i64_3:
; RV64ZBKB: # %bb.0:
; RV64ZBKB-NEXT: lw a0, 0(a0)
-; RV64ZBKB-NEXT: lwu a1, 0(a1)
+; RV64ZBKB-NEXT: lw a1, 0(a1)
; RV64ZBKB-NEXT: pack a0, a1, a0
; RV64ZBKB-NEXT: ret
%3 = load i32, ptr %0, align 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll
index 96c349d..d166a6e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll
@@ -92,6 +92,150 @@ entry:
ret <vscale x 1 x i32> %va
}

+define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee2(<vscale x 1 x i32> %va) nounwind {
+; SPILL-O2-LABEL: test_vector_callee2:
+; SPILL-O2: # %bb.0: # %entry
+; SPILL-O2-NEXT: addi sp, sp, -16
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 12
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: sub sp, sp, a0
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 11
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 10
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a1, a0, 3
+; SPILL-O2-NEXT: add a0, a1, a0
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a0, a0, 3
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: addi a0, sp, 16
+; SPILL-O2-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; SPILL-O2-NEXT: #APP
+; SPILL-O2-NEXT: #NO_APP
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 11
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 10
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a1, a0, 3
+; SPILL-O2-NEXT: add a0, a1, a0
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a0, a0, 3
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: addi a0, sp, 16
+; SPILL-O2-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 12
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add sp, sp, a0
+; SPILL-O2-NEXT: addi sp, sp, 16
+; SPILL-O2-NEXT: ret
+entry:
+ call void asm sideeffect "",
+ "~{v1},~{v3},~{v5},~{v7},~{v24m2},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+
+ ret <vscale x 1 x i32> %va
+}
+
+define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee3(<vscale x 1 x i32> %va) nounwind {
+; SPILL-O2-LABEL: test_vector_callee3:
+; SPILL-O2: # %bb.0: # %entry
+; SPILL-O2-NEXT: addi sp, sp, -16
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 10
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: sub sp, sp, a0
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a1, a0, 3
+; SPILL-O2-NEXT: add a0, a1, a0
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a0, a0, 3
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 6
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs2r.v v2, (a0) # vscale x 16-byte Folded Spill
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a0, a0, 2
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vs2r.v v26, (a0) # vscale x 16-byte Folded Spill
+; SPILL-O2-NEXT: addi a0, sp, 16
+; SPILL-O2-NEXT: vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; SPILL-O2-NEXT: #APP
+; SPILL-O2-NEXT: #NO_APP
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a1, a0, 3
+; SPILL-O2-NEXT: add a0, a1, a0
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a0, a0, 3
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 6
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl2r.v v2, (a0) # vscale x 16-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: slli a0, a0, 2
+; SPILL-O2-NEXT: add a0, sp, a0
+; SPILL-O2-NEXT: addi a0, a0, 16
+; SPILL-O2-NEXT: vl2r.v v26, (a0) # vscale x 16-byte Folded Reload
+; SPILL-O2-NEXT: addi a0, sp, 16
+; SPILL-O2-NEXT: vl4r.v v28, (a0) # vscale x 32-byte Folded Reload
+; SPILL-O2-NEXT: csrr a0, vlenb
+; SPILL-O2-NEXT: li a1, 10
+; SPILL-O2-NEXT: mul a0, a0, a1
+; SPILL-O2-NEXT: add sp, sp, a0
+; SPILL-O2-NEXT: addi sp, sp, 16
+; SPILL-O2-NEXT: ret
+entry:
+ call void asm sideeffect "",
+ "~{v1},~{v2},~{v3},~{v24},~{v26m2},~{v28m2},~{v29},~{v30},~{v31}"()
+
+ ret <vscale x 1 x i32> %va
+}
+
; Make sure the local stack allocation pass doesn't count vector registers. The
; sizes are chosen to be on the edge of what RISCVRegister::needsFrameBaseReg
; considers to need a virtual base register.
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index 5b82b27..81b2b65 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -63,10 +63,10 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
; RV64-NEXT: and a2, t4, a2
; RV64-NEXT: and t0, t3, t1
; RV64-NEXT: and a7, t2, a7
-; RV64-NEXT: negw a7, a7
-; RV64-NEXT: negw t0, t0
-; RV64-NEXT: negw a2, a2
-; RV64-NEXT: negw a3, a3
+; RV64-NEXT: neg a7, a7
+; RV64-NEXT: neg t0, t0
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: neg a3, a3
; RV64-NEXT: and a4, a7, a4
; RV64-NEXT: and a6, t0, a6
; RV64-NEXT: and a1, a2, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index bdf344d..9694912 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -190,6 +190,62 @@ define {<4 x i32>, <4 x i32>} @vpload_factor2(ptr %ptr) {
ret {<4 x i32>, <4 x i32>} %res1
}

+define {<4 x i32>, <4 x i32>} @vpload_factor2_interleaved_mask_intrinsic(ptr %ptr, <4 x i1> %m) {
+; CHECK-LABEL: vpload_factor2_interleaved_mask_intrinsic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %interleaved.mask = call <8 x i1> @llvm.vector.interleave2(<4 x i1> %m, <4 x i1> %m)
+ %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> %interleaved.mask, i32 8)
+ %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
+}
+
+define {<4 x i32>, <4 x i32>} @vpload_factor2_interleaved_mask_shuffle(ptr %ptr, <4 x i1> %m) {
+; CHECK-LABEL: vpload_factor2_interleaved_mask_shuffle:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %interleaved.mask = shufflevector <4 x i1> %m, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> %interleaved.mask, i32 8)
+ %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
+}
+
+define {<4 x i32>, <4 x i32>} @vpload_factor2_interleaved_mask_shuffle2(ptr %ptr, <2 x i1> %m) {
+; CHECK-LABEL: vpload_factor2_interleaved_mask_shuffle2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vwaddu.vv v9, v8, v8
+; CHECK-NEXT: vwmaccu.vx v9, a1, v8
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: vle32.v v10, (a0), v0.t
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-NEXT: vnsrl.wx v9, v10, a0
+; CHECK-NEXT: ret
+ %interleaved.mask = shufflevector <2 x i1> %m, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> %interleaved.mask, i32 4)
+ %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
+}
define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) {
; CHECK-LABEL: vpload_factor3:
@@ -423,8 +479,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: li a2, 32
; RV32-NEXT: lui a3, 12
; RV32-NEXT: lui a6, 12291
-; RV32-NEXT: lui a7, %hi(.LCPI20_0)
-; RV32-NEXT: addi a7, a7, %lo(.LCPI20_0)
+; RV32-NEXT: lui a7, %hi(.LCPI23_0)
+; RV32-NEXT: addi a7, a7, %lo(.LCPI23_0)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vle32.v v24, (a5)
; RV32-NEXT: vmv.s.x v0, a3
@@ -509,12 +565,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
; RV32-NEXT: lui a7, 49164
-; RV32-NEXT: lui a1, %hi(.LCPI20_1)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI20_1)
+; RV32-NEXT: lui a1, %hi(.LCPI23_1)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI23_1)
; RV32-NEXT: lui t2, 3
; RV32-NEXT: lui t1, 196656
-; RV32-NEXT: lui a4, %hi(.LCPI20_3)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI20_3)
+; RV32-NEXT: lui a4, %hi(.LCPI23_3)
+; RV32-NEXT: addi a4, a4, %lo(.LCPI23_3)
; RV32-NEXT: lui t0, 786624
; RV32-NEXT: li a5, 48
; RV32-NEXT: lui a6, 768
@@ -693,8 +749,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v24, v8, v2
-; RV32-NEXT: lui a1, %hi(.LCPI20_2)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI20_2)
+; RV32-NEXT: lui a1, %hi(.LCPI23_2)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI23_2)
; RV32-NEXT: lui a3, 3073
; RV32-NEXT: addi a3, a3, -1024
; RV32-NEXT: vmv.s.x v0, a3
@@ -758,16 +814,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vrgatherei16.vv v28, v8, v3
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
; RV32-NEXT: vmv.v.v v28, v24
-; RV32-NEXT: lui a1, %hi(.LCPI20_4)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI20_4)
-; RV32-NEXT: lui a2, %hi(.LCPI20_5)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI20_5)
+; RV32-NEXT: lui a1, %hi(.LCPI23_4)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI23_4)
+; RV32-NEXT: lui a2, %hi(.LCPI23_5)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI23_5)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v24, (a2)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v8, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI20_7)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI20_7)
+; RV32-NEXT: lui a1, %hi(.LCPI23_7)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI23_7)
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle16.v v10, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -795,14 +851,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v16, v0, v10
-; RV32-NEXT: lui a1, %hi(.LCPI20_6)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI20_6)
-; RV32-NEXT: lui a2, %hi(.LCPI20_8)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI20_8)
+; RV32-NEXT: lui a1, %hi(.LCPI23_6)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI23_6)
+; RV32-NEXT: lui a2, %hi(.LCPI23_8)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI23_8)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v4, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI20_9)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI20_9)
+; RV32-NEXT: lui a1, %hi(.LCPI23_9)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI23_9)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v6, (a1)
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
@@ -889,8 +945,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: li a4, 128
; RV64-NEXT: lui a1, 1
; RV64-NEXT: vle64.v v8, (a3)
-; RV64-NEXT: lui a3, %hi(.LCPI20_0)
-; RV64-NEXT: addi a3, a3, %lo(.LCPI20_0)
+; RV64-NEXT: lui a3, %hi(.LCPI23_0)
+; RV64-NEXT: addi a3, a3, %lo(.LCPI23_0)
; RV64-NEXT: vmv.s.x v0, a4
; RV64-NEXT: csrr a4, vlenb
; RV64-NEXT: li a5, 61
@@ -1078,8 +1134,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT: lui a2, %hi(.LCPI20_1)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI20_1)
+; RV64-NEXT: lui a2, %hi(.LCPI23_1)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI23_1)
; RV64-NEXT: li a3, 192
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v6, (a2)
@@ -1113,8 +1169,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vrgatherei16.vv v24, v16, v6
; RV64-NEXT: addi a2, sp, 16
; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: lui a2, %hi(.LCPI20_2)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI20_2)
+; RV64-NEXT: lui a2, %hi(.LCPI23_2)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI23_2)
; RV64-NEXT: li a3, 1040
; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: addi a1, a1, -2016
@@ -1198,12 +1254,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT: lui a1, %hi(.LCPI20_3)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI20_3)
+; RV64-NEXT: lui a1, %hi(.LCPI23_3)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI23_3)
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v20, (a1)
-; RV64-NEXT: lui a1, %hi(.LCPI20_4)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI20_4)
+; RV64-NEXT: lui a1, %hi(.LCPI23_4)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI23_4)
; RV64-NEXT: vle16.v v8, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 77
@@ -1254,8 +1310,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vrgatherei16.vv v0, v16, v8
-; RV64-NEXT: lui a1, %hi(.LCPI20_5)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI20_5)
+; RV64-NEXT: lui a1, %hi(.LCPI23_5)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI23_5)
; RV64-NEXT: vle16.v v20, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 61
@@ -1472,6 +1528,19 @@ define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
ret void
}

+define void @vpstore_factor2_interleaved_mask_intrinsic(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i1> %m) {
+; CHECK-LABEL: vpstore_factor2_interleaved_mask_intrinsic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg2e32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %interleaved.mask = call <8 x i1> @llvm.vector.interleave2(<4 x i1> %m, <4 x i1> %m)
+ %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ tail call void @llvm.vp.store.v8i32.p0(<8 x i32> %interleaved.vec, ptr %ptr, <8 x i1> %interleaved.mask, i32 8)
+ ret void
+}
+
+
define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
; CHECK-LABEL: vpstore_factor3:
; CHECK: # %bb.0:
@@ -1559,6 +1628,24 @@ define void @vpstore_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %
ret void
}

+define void @vpstore_factor7_masked(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i1> %m) {
+; CHECK-LABEL: vpstore_factor7_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg7e16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %interleaved.mask = shufflevector <2 x i1> %m, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef>
+ %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+ tail call void @llvm.vp.store.v14i16.p0(<14 x i16> %interleaved.vec, ptr %ptr, <14 x i1> %interleaved.mask, i32 14)
+ ret void
+}
+
define void @vpstore_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
; CHECK-LABEL: vpstore_factor8:
; CHECK: # %bb.0:
@@ -1757,8 +1844,9 @@ define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
define void @vpstore_factor4_one_active(ptr %ptr, <4 x i32> %v) {
; CHECK-LABEL: vpstore_factor4_one_active:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: vsse32.v v8, (a0), a1
; CHECK-NEXT: ret
%v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %v0, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
@@ -1782,7 +1870,7 @@ define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
; CHECK-LABEL: store_factor4_one_active_fullwidth:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsse32.v v8, (a0), a1
; CHECK-NEXT: ret
%v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
@@ -1839,8 +1927,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
; RV32-NEXT: vle32.v v12, (a0), v0.t
; RV32-NEXT: li a0, 36
; RV32-NEXT: vmv.s.x v20, a1
-; RV32-NEXT: lui a1, %hi(.LCPI54_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI54_0)
+; RV32-NEXT: lui a1, %hi(.LCPI59_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI59_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v21, (a1)
; RV32-NEXT: vcompress.vm v8, v12, v11
@@ -1915,8 +2003,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
; RV32-NEXT: vmv.s.x v10, a0
; RV32-NEXT: li a0, 146
; RV32-NEXT: vmv.s.x v11, a0
-; RV32-NEXT: lui a0, %hi(.LCPI55_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI55_0)
+; RV32-NEXT: lui a0, %hi(.LCPI60_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI60_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v20, (a0)
; RV32-NEXT: li a0, 36
@@ -1974,3 +2062,34 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
%res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5(ptr %ptr) {
+; CHECK-LABEL: maskedload_factor5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> splat (i1 true), <20 x i32> poison)
+ %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+ %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
+ %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
+ %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
+ %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+ %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
+ ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+}
+
+define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: maskedstore_factor2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ tail call void @llvm.masked.store(<8 x i32> %interleaved.vec, ptr %ptr, i32 4, <8 x i1> splat (i1 true))
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 07aa05f..48845c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -930,7 +930,7 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
; CHECK-NEXT: add a2, a0, a4
; CHECK-NEXT: slli a5, a4, 2
; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
; CHECK-NEXT: add a1, a1, a5
; CHECK-NEXT: slli a3, a3, 32
; CHECK-NEXT: srli a3, a3, 32
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index b6253c6..dcf1ab0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -204,7 +204,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %
; RV64-SLOW-NEXT: # %bb.1: # %cond.load
; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, tu, ma
; RV64-SLOW-NEXT: vmv.x.s a1, v8
-; RV64-SLOW-NEXT: lwu a2, 4(a1)
+; RV64-SLOW-NEXT: lw a2, 4(a1)
; RV64-SLOW-NEXT: lwu a1, 0(a1)
; RV64-SLOW-NEXT: slli a2, a2, 32
; RV64-SLOW-NEXT: or a1, a2, a1
@@ -216,7 +216,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %
; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1
; RV64-SLOW-NEXT: vmv.x.s a0, v8
-; RV64-SLOW-NEXT: lwu a1, 4(a0)
+; RV64-SLOW-NEXT: lw a1, 4(a0)
; RV64-SLOW-NEXT: lwu a0, 0(a0)
; RV64-SLOW-NEXT: slli a1, a1, 32
; RV64-SLOW-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index 1a716f6..e89bac5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -818,7 +818,7 @@ define <2 x i64> @vwaddu_vx_v2i64_i32(ptr %x, ptr %y) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-NEXT: vle32.v v9, (a0)
-; RV64-NEXT: lwu a0, 0(a1)
+; RV64-NEXT: lw a0, 0(a1)
; RV64-NEXT: vwaddu.vx v8, v9, a0
; RV64-NEXT: ret
%a = load <2 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
index 8ebd93e..b933ef9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -853,7 +853,7 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-NEXT: vle32.v v9, (a0)
-; RV64-NEXT: lwu a0, 0(a1)
+; RV64-NEXT: lw a0, 0(a1)
; RV64-NEXT: vwmulsu.vx v8, v9, a0
; RV64-NEXT: ret
%a = load <2 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index 90e9ffd..7cedee5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -710,13 +710,6 @@ define <4 x i32> @vwmulu_vx_v4i32_i8(ptr %x, ptr %y) {
}
define <4 x i32> @vwmulu_vx_v4i32_i16(ptr %x, ptr %y) {
-; CHECK-LABEL: vwmulu_vx_v4i32_i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v9, (a0)
-; CHECK-NEXT: lhu a0, 0(a1)
-; CHECK-NEXT: vwmulu.vx v8, v9, a0
-; CHECK-NEXT: ret
%a = load <4 x i16>, ptr %x
%b = load i16, ptr %y
%c = zext i16 %b to i32
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index bfdda47..86ac038e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -821,7 +821,7 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind {
;
; RV64-LABEL: vwsubu_vx_v2i64_i32:
; RV64: # %bb.0:
-; RV64-NEXT: lwu a1, 0(a1)
+; RV64-NEXT: lw a1, 0(a1)
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-NEXT: vle32.v v9, (a0)
; RV64-NEXT: vmv.v.x v10, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index f9ac53b..f481f9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -274,10 +274,10 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NOV-NEXT: sgtz a6, a2
; CHECK-NOV-NEXT: sgtz a7, a3
; CHECK-NOV-NEXT: sgtz t0, a5
-; CHECK-NOV-NEXT: negw t0, t0
-; CHECK-NOV-NEXT: negw a7, a7
-; CHECK-NOV-NEXT: negw a6, a6
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg t0, t0
+; CHECK-NOV-NEXT: neg a7, a7
+; CHECK-NOV-NEXT: neg a6, a6
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: and a5, t0, a5
; CHECK-NOV-NEXT: and a3, a7, a3
; CHECK-NOV-NEXT: and a2, a6, a2
@@ -755,10 +755,10 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: sgtz a4, s1
; CHECK-NOV-NEXT: sgtz a5, a1
; CHECK-NOV-NEXT: sgtz a6, a3
-; CHECK-NOV-NEXT: negw a6, a6
-; CHECK-NOV-NEXT: negw a5, a5
-; CHECK-NOV-NEXT: negw a4, a4
-; CHECK-NOV-NEXT: negw a2, a2
+; CHECK-NOV-NEXT: neg a6, a6
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: neg a4, a4
+; CHECK-NOV-NEXT: neg a2, a2
; CHECK-NOV-NEXT: and a3, a6, a3
; CHECK-NOV-NEXT: and a1, a5, a1
; CHECK-NOV-NEXT: and a4, a4, s1
@@ -1166,10 +1166,10 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) {
; CHECK-NOV-NEXT: sgtz a6, a2
; CHECK-NOV-NEXT: sgtz a7, a3
; CHECK-NOV-NEXT: sgtz t0, a5
-; CHECK-NOV-NEXT: negw t0, t0
-; CHECK-NOV-NEXT: negw a7, a7
-; CHECK-NOV-NEXT: negw a6, a6
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg t0, t0
+; CHECK-NOV-NEXT: neg a7, a7
+; CHECK-NOV-NEXT: neg a6, a6
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: and a5, t0, a5
; CHECK-NOV-NEXT: and a3, a7, a3
; CHECK-NOV-NEXT: and a2, a6, a2
@@ -2040,14 +2040,14 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: sgtz t4, a5
; CHECK-NOV-NEXT: sgtz t5, a6
; CHECK-NOV-NEXT: sgtz t6, a7
-; CHECK-NOV-NEXT: negw t6, t6
-; CHECK-NOV-NEXT: negw t5, t5
-; CHECK-NOV-NEXT: negw t4, t4
-; CHECK-NOV-NEXT: negw t3, t3
-; CHECK-NOV-NEXT: negw t2, t2
-; CHECK-NOV-NEXT: negw t1, t1
-; CHECK-NOV-NEXT: negw t0, t0
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg t6, t6
+; CHECK-NOV-NEXT: neg t5, t5
+; CHECK-NOV-NEXT: neg t4, t4
+; CHECK-NOV-NEXT: neg t3, t3
+; CHECK-NOV-NEXT: neg t2, t2
+; CHECK-NOV-NEXT: neg t1, t1
+; CHECK-NOV-NEXT: neg t0, t0
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: and a7, t6, a7
; CHECK-NOV-NEXT: and a6, t5, a6
; CHECK-NOV-NEXT: and a5, t4, a5
@@ -3830,16 +3830,16 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
; CHECK-NOV-NEXT: mv a5, a3
; CHECK-NOV-NEXT: .LBB32_5: # %entry
; CHECK-NOV-NEXT: sgtz a3, a5
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a5
; CHECK-NOV-NEXT: sgtz a5, a4
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: and a4, a5, a4
; CHECK-NOV-NEXT: sgtz a5, a2
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: and a2, a5, a2
; CHECK-NOV-NEXT: sgtz a5, a1
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: and a1, a5, a1
; CHECK-NOV-NEXT: sw a3, 0(a0)
; CHECK-NOV-NEXT: sw a4, 4(a0)
@@ -4306,16 +4306,16 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
; CHECK-NOV-NEXT: mv a3, a2
; CHECK-NOV-NEXT: .LBB35_5: # %entry
; CHECK-NOV-NEXT: sgtz a2, a3
-; CHECK-NOV-NEXT: negw a2, a2
+; CHECK-NOV-NEXT: neg a2, a2
; CHECK-NOV-NEXT: and a2, a2, a3
; CHECK-NOV-NEXT: sgtz a3, a1
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a1, a3, a1
; CHECK-NOV-NEXT: sgtz a3, s1
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, s1
; CHECK-NOV-NEXT: sgtz a4, a0
-; CHECK-NOV-NEXT: negw a4, a4
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: and a0, a4, a0
; CHECK-NOV-NEXT: sw a2, 0(s0)
; CHECK-NOV-NEXT: sw a1, 4(s0)
@@ -4707,16 +4707,16 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) {
; CHECK-NOV-NEXT: mv a5, a3
; CHECK-NOV-NEXT: .LBB41_5: # %entry
; CHECK-NOV-NEXT: sgtz a3, a5
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a5
; CHECK-NOV-NEXT: sgtz a5, a4
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: and a4, a5, a4
; CHECK-NOV-NEXT: sgtz a5, a2
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: and a2, a5, a2
; CHECK-NOV-NEXT: sgtz a5, a1
-; CHECK-NOV-NEXT: negw a5, a5
+; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: and a1, a5, a1
; CHECK-NOV-NEXT: sh a3, 0(a0)
; CHECK-NOV-NEXT: sh a4, 2(a0)
@@ -5572,28 +5572,28 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-NOV-NEXT: mv a7, a3
; CHECK-NOV-NEXT: .LBB44_9: # %entry
; CHECK-NOV-NEXT: sgtz a3, a7
-; CHECK-NOV-NEXT: negw a3, a3
+; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a7
; CHECK-NOV-NEXT: sgtz a7, a6
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: and a6, a7, a6
; CHECK-NOV-NEXT: sgtz a7, a5
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: and a5, a7, a5
; CHECK-NOV-NEXT: sgtz a7, a4
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: and a4, a7, a4
; CHECK-NOV-NEXT: sgtz a7, a2
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: and a2, a7, a2
; CHECK-NOV-NEXT: sgtz a7, a1
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: and a1, a7, a1
; CHECK-NOV-NEXT: sgtz a7, s1
-; CHECK-NOV-NEXT: negw a7, a7
+; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: and a7, a7, s1
; CHECK-NOV-NEXT: sgtz t0, a0
-; CHECK-NOV-NEXT: negw t0, t0
+; CHECK-NOV-NEXT: neg t0, t0
; CHECK-NOV-NEXT: and a0, t0, a0
; CHECK-NOV-NEXT: sh a2, 8(s0)
; CHECK-NOV-NEXT: sh a1, 10(s0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll b/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll
index af2e8d3..42c2556 100644
--- a/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll
@@ -14,12 +14,8 @@ define void @foo_lmul1() nounwind #0 {
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 1
; CHECK-RV32-NEXT: sub sp, sp, a0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
; CHECK-RV32-NEXT: lui a0, %hi(a)
; CHECK-RV32-NEXT: addi a0, a0, %lo(a)
; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
@@ -31,12 +27,8 @@ define void @foo_lmul1() nounwind #0 {
; CHECK-RV32-NEXT: lui a0, %hi(c)
; CHECK-RV32-NEXT: addi a0, a0, %lo(c)
; CHECK-RV32-NEXT: vse32.v v8, (a0)
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 1
; CHECK-RV32-NEXT: add sp, sp, a0
@@ -62,25 +54,8 @@ define void @foo_lmul2() nounwind #0 {
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 2
; CHECK-RV32-NEXT: sub sp, sp, a0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: slli a1, a0, 1
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT: vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
; CHECK-RV32-NEXT: lui a0, %hi(d)
; CHECK-RV32-NEXT: addi a0, a0, %lo(d)
; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
@@ -92,25 +67,8 @@ define void @foo_lmul2() nounwind #0 {
; CHECK-RV32-NEXT: lui a0, %hi(f)
; CHECK-RV32-NEXT: addi a0, a0, %lo(f)
; CHECK-RV32-NEXT: vse32.v v8, (a0)
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: slli a1, a0, 1
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT: vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 2
; CHECK-RV32-NEXT: add sp, sp, a0
@@ -136,56 +94,8 @@ define void @foo_lmul4() nounwind #0 {
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: sub sp, sp, a0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: slli a1, a0, 3
-; CHECK-RV32-NEXT: sub a0, a1, a0
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: slli a1, a0, 2
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: slli a1, a0, 1
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-NEXT: lui a0, %hi(g)
; CHECK-RV32-NEXT: addi a0, a0, %lo(g)
; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
@@ -197,50 +107,8 @@ define void @foo_lmul4() nounwind #0 {
; CHECK-RV32-NEXT: lui a0, %hi(i)
; CHECK-RV32-NEXT: addi a0, a0, %lo(i)
; CHECK-RV32-NEXT: vse32.v v8, (a0)
-; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 3
-; CHECK-RV32-NEXT: sub a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 2
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 1
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add sp, sp, a0
@@ -268,108 +136,12 @@ define void @foo_lmul8() nounwind #0 {
; CHECK-RV32-NEXT: slli a0, a0, 4
; CHECK-RV32-NEXT: sub sp, sp, a0
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 4
-; CHECK-RV32-NEXT: sub a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a1, a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a1, a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a1, a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 3
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 3
-; CHECK-RV32-NEXT: sub a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 2
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 1
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-NEXT: lui a0, %hi(j)
; CHECK-RV32-NEXT: addi a0, a0, %lo(j)
; CHECK-RV32-NEXT: li a1, 32
@@ -383,108 +155,12 @@ define void @foo_lmul8() nounwind #0 {
; CHECK-RV32-NEXT: addi a0, a0, %lo(l)
; CHECK-RV32-NEXT: vse32.v v8, (a0)
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 4
-; CHECK-RV32-NEXT: sub a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a1, a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a1, a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a1, a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 3
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 3
-; CHECK-RV32-NEXT: sub a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: mv a1, a0
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 2
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 2
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a1, a0, 1
-; CHECK-RV32-NEXT: add a0, a1, a0
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 4
; CHECK-RV32-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
index 4d9a6ae..749b2041 100644
--- a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
@@ -11,7 +11,7 @@ define i32 @vscale_known_nonzero() {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: negw a1, a0
+; CHECK-NEXT: neg a1, a0
; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: slli a1, a0, 6
; CHECK-NEXT: slli a2, a0, 8
@@ -19,16 +19,16 @@ define i32 @vscale_known_nonzero() {
; CHECK-NEXT: slli a4, a0, 12
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: slli a2, a0, 16
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
; CHECK-NEXT: slli a4, a0, 18
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
; CHECK-NEXT: slli a4, a0, 4
-; CHECK-NEXT: subw a4, a0, a4
+; CHECK-NEXT: sub a4, a0, a4
; CHECK-NEXT: add a1, a4, a1
; CHECK-NEXT: slli a4, a0, 14
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
; CHECK-NEXT: slli a4, a0, 23
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
; CHECK-NEXT: slli a0, a0, 27
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: add a0, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index a050034..a7eaf39 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -78,12 +78,12 @@ body: |
; CHECK-NEXT: %false:vrnov0 = COPY $v9
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
- ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */
+ ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 4, 5 /* e32 */, 0 /* tu, mu */
%pt:vrnov0 = COPY $v8
%false:vrnov0 = COPY $v9
%mask:vmv0 = COPY $v0
- %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
- %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */
+ %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 8, 5 /* e32 */, 0 /* tu, mu */
+ %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 4, 5 /* e32 */
...
---
# Shouldn't be converted because false operands are different
@@ -163,3 +163,47 @@ body: |
%true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
bb.1:
%5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
+...
+---
+# Shouldn't be converted because vmerge adds back elements from false past avl, which would be lost if we converted to vmv.v.v.
+name: preserve_false
+body: |
+ bb.0:
+ liveins: $v8, $v9, $v0, $x8, $x9
+ ; CHECK-LABEL: name: preserve_false
+ ; CHECK: liveins: $v8, $v9, $v0, $x8, $x9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %pt:vrnov0 = COPY $v8
+ ; CHECK-NEXT: %false:vr = COPY $v9
+ ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+ ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8
+ ; CHECK-NEXT: %avl2:gprnox0 = COPY $x9
+ ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+ ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+ %pt:vrnov0 = COPY $v8
+ %false:vr = COPY $v9
+ %mask:vmv0 = COPY $v0
+ %avl1:gprnox0 = COPY $x8
+ %avl2:gprnox0 = COPY $x9
+ %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+ %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+...
+---
+# But we can convert this one because vmerge's avl is <= true's avl, so we don't lose any false elements past avl.
+name: preserve_false_avl_known_le
+body: |
+ bb.0:
+ liveins: $v8, $v9, $v0
+ ; CHECK-LABEL: name: preserve_false_avl_known_le
+ ; CHECK: liveins: $v8, $v9, $v0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %pt:vr = COPY $v8
+ ; CHECK-NEXT: %false:vrnov0 = COPY $v9
+ ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+ ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */
+ ; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */
+ %pt:vrnov0 = COPY $v8
+ %false:vr = COPY $v9
+ %mask:vmv0 = COPY $v0
+ %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */
+ %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
index 3aeb4e8..9ffc84a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
@@ -71,10 +71,31 @@ define <vscale x 8 x i64> @vpmerge_m8(<vscale x 8 x i64> %x, <vscale x 8 x i64>
ret <vscale x 8 x i64> %1
}
-declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1>, <vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1>, <vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1>, <vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1>, <vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+; Shouldn't be converted because vmerge adds back elements from false past avl, which would be lost if we converted to vmv.v.v.
+define <vscale x 2 x i32> @preserve_false(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask, i64 %avl1, i64 %avl2) {
+; CHECK-LABEL: preserve_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vle32.v v10, (a0), v0.t
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, tu, ma
+; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0
+; CHECK-NEXT: ret
+ %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 %avl1, i64 3)
+ %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 %avl2)
+ ret <vscale x 2 x i32> %res
+}
+
+; Can fold this because its avl is known to be <= true's, so no elements from false need to be introduced past avl.
+define <vscale x 2 x i32> @preserve_false_avl_known_le(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: preserve_false_avl_known_le:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vle32.v v9, (a0), v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: ret
+ %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 2, i64 3)
+ %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 1)
+ ret <vscale x 2 x i32> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 8495dfe..32892bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: --check-prefixes=CHECK,CHECK32,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: --check-prefixes=CHECK,CHECK64,ZVFH
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: --check-prefixes=CHECK,CHECK32,ZVFHMIN
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
-; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: --check-prefixes=CHECK,CHECK64,ZVFHMIN
declare <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, metadata, <vscale x 1 x i1>, i32)
@@ -4820,6 +4820,427 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f64(<vscale x 8 x double> %va, do
declare <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, metadata, <vscale x 32 x i1>, i32)
define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK32-LABEL: fcmp_oeq_vv_nxv32f64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: addi sp, sp, -48
+; CHECK32-NEXT: .cfi_def_cfa_offset 48
+; CHECK32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; CHECK32-NEXT: .cfi_offset ra, -4
+; CHECK32-NEXT: .cfi_offset s0, -8
+; CHECK32-NEXT: .cfi_offset s1, -12
+; CHECK32-NEXT: .cfi_offset s2, -16
+; CHECK32-NEXT: .cfi_offset s3, -20
+; CHECK32-NEXT: .cfi_offset s4, -24
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 2
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: sub sp, sp, a1
+; CHECK32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 26 * vlenb
+; CHECK32-NEXT: mv s1, a6
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: mv s3, a2
+; CHECK32-NEXT: mv s2, a0
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a1, a0, 3
+; CHECK32-NEXT: add a0, a1, a0
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a0, a0, 3
+; CHECK32-NEXT: add a0, a0, a1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr s0, vlenb
+; CHECK32-NEXT: li a1, 24
+; CHECK32-NEXT: mv a0, s0
+; CHECK32-NEXT: call __mulsi3
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vl1r.v v6, (a1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a4, s0, 3
+; CHECK32-NEXT: srli s4, s0, 2
+; CHECK32-NEXT: srli a0, s0, 3
+; CHECK32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslidedown.vx v7, v6, s4
+; CHECK32-NEXT: add a2, s3, a4
+; CHECK32-NEXT: vl8re64.v v16, (a2)
+; CHECK32-NEXT: slli a6, s0, 4
+; CHECK32-NEXT: slli a2, s0, 1
+; CHECK32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK32-NEXT: vslidedown.vx v0, v6, a0
+; CHECK32-NEXT: mv a3, s1
+; CHECK32-NEXT: bltu s1, a2, .LBB257_2
+; CHECK32-NEXT: # %bb.1:
+; CHECK32-NEXT: mv a3, a2
+; CHECK32-NEXT: .LBB257_2:
+; CHECK32-NEXT: add a5, s3, a1
+; CHECK32-NEXT: add a1, s2, a4
+; CHECK32-NEXT: vslidedown.vx v9, v7, a0
+; CHECK32-NEXT: csrr a4, vlenb
+; CHECK32-NEXT: slli a7, a4, 4
+; CHECK32-NEXT: add a4, a7, a4
+; CHECK32-NEXT: add a4, sp, a4
+; CHECK32-NEXT: addi a4, a4, 16
+; CHECK32-NEXT: vs1r.v v9, (a4) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: add a4, s3, a6
+; CHECK32-NEXT: vl8re64.v v24, (s3)
+; CHECK32-NEXT: sub a6, a3, s0
+; CHECK32-NEXT: sltu a7, a3, a6
+; CHECK32-NEXT: addi a7, a7, -1
+; CHECK32-NEXT: and a6, a7, a6
+; CHECK32-NEXT: csrr a7, vlenb
+; CHECK32-NEXT: slli t0, a7, 3
+; CHECK32-NEXT: add a7, t0, a7
+; CHECK32-NEXT: add a7, sp, a7
+; CHECK32-NEXT: addi a7, a7, 16
+; CHECK32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK32-NEXT: bltu a3, s0, .LBB257_4
+; CHECK32-NEXT: # %bb.3:
+; CHECK32-NEXT: mv a3, s0
+; CHECK32-NEXT: .LBB257_4:
+; CHECK32-NEXT: vmv1r.v v0, v6
+; CHECK32-NEXT: vl8re64.v v8, (a5)
+; CHECK32-NEXT: csrr a5, vlenb
+; CHECK32-NEXT: slli a6, a5, 3
+; CHECK32-NEXT: add a5, a6, a5
+; CHECK32-NEXT: add a5, sp, a5
+; CHECK32-NEXT: addi a5, a5, 16
+; CHECK32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a5, vlenb
+; CHECK32-NEXT: slli a5, a5, 1
+; CHECK32-NEXT: mv a6, a5
+; CHECK32-NEXT: slli a5, a5, 3
+; CHECK32-NEXT: add a5, a5, a6
+; CHECK32-NEXT: add a5, sp, a5
+; CHECK32-NEXT: addi a5, a5, 16
+; CHECK32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK32-NEXT: vl8re64.v v16, (a1)
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: vl8re64.v v16, (a4)
+; CHECK32-NEXT: sub a1, s1, a2
+; CHECK32-NEXT: sltu a2, s1, a1
+; CHECK32-NEXT: vl8re64.v v24, (s2)
+; CHECK32-NEXT: addi a2, a2, -1
+; CHECK32-NEXT: and s1, a2, a1
+; CHECK32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; CHECK32-NEXT: vslideup.vx v8, v5, a0
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a2, a1
+; CHECK32-NEXT: slli a1, a1, 3
+; CHECK32-NEXT: add a1, a1, a2
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: mv a1, s1
+; CHECK32-NEXT: bltu s1, s0, .LBB257_6
+; CHECK32-NEXT: # %bb.5:
+; CHECK32-NEXT: mv a1, s0
+; CHECK32-NEXT: .LBB257_6:
+; CHECK32-NEXT: vmv1r.v v0, v7
+; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t
+; CHECK32-NEXT: addi a1, sp, 16
+; CHECK32-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: li a1, 3
+; CHECK32-NEXT: call __mulsi3
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a2, a1, 4
+; CHECK32-NEXT: add a1, a2, a1
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a2, a1
+; CHECK32-NEXT: slli a1, a1, 3
+; CHECK32-NEXT: add a1, a1, a2
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: addi a1, sp, 16
+; CHECK32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK32-NEXT: vslideup.vx v9, v8, s4
+; CHECK32-NEXT: sub a1, s1, s0
+; CHECK32-NEXT: sltu a2, s1, a1
+; CHECK32-NEXT: addi a2, a2, -1
+; CHECK32-NEXT: and a1, a2, a1
+; CHECK32-NEXT: csrr a2, vlenb
+; CHECK32-NEXT: slli a3, a2, 3
+; CHECK32-NEXT: add a2, a3, a2
+; CHECK32-NEXT: add a2, sp, a2
+; CHECK32-NEXT: addi a2, a2, 16
+; CHECK32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: csrr a2, vlenb
+; CHECK32-NEXT: add a2, sp, a2
+; CHECK32-NEXT: addi a2, a2, 16
+; CHECK32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t
+; CHECK32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslideup.vx v9, v8, a0
+; CHECK32-NEXT: vmv1r.v v0, v9
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a1, a1, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, a0, a1
+; CHECK32-NEXT: add sp, sp, a0
+; CHECK32-NEXT: .cfi_def_cfa sp, 48
+; CHECK32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; CHECK32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; CHECK32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
+; CHECK32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
+; CHECK32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
+; CHECK32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
+; CHECK32-NEXT: .cfi_restore ra
+; CHECK32-NEXT: .cfi_restore s0
+; CHECK32-NEXT: .cfi_restore s1
+; CHECK32-NEXT: .cfi_restore s2
+; CHECK32-NEXT: .cfi_restore s3
+; CHECK32-NEXT: .cfi_restore s4
+; CHECK32-NEXT: addi sp, sp, 48
+; CHECK32-NEXT: .cfi_def_cfa_offset 0
+; CHECK32-NEXT: ret
+;
+; CHECK64-LABEL: fcmp_oeq_vv_nxv32f64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: addi sp, sp, -64
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
+; CHECK64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; CHECK64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; CHECK64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; CHECK64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
+; CHECK64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
+; CHECK64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
+; CHECK64-NEXT: .cfi_offset ra, -8
+; CHECK64-NEXT: .cfi_offset s0, -16
+; CHECK64-NEXT: .cfi_offset s1, -24
+; CHECK64-NEXT: .cfi_offset s2, -32
+; CHECK64-NEXT: .cfi_offset s3, -40
+; CHECK64-NEXT: .cfi_offset s4, -48
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 2
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: sub sp, sp, a1
+; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 26 * vlenb
+; CHECK64-NEXT: mv s1, a6
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: mv s3, a2
+; CHECK64-NEXT: mv s2, a0
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a1, a0, 3
+; CHECK64-NEXT: add a0, a1, a0
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv a1, a0
+; CHECK64-NEXT: slli a0, a0, 3
+; CHECK64-NEXT: add a0, a0, a1
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr s0, vlenb
+; CHECK64-NEXT: li a1, 24
+; CHECK64-NEXT: mv a0, s0
+; CHECK64-NEXT: call __muldi3
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vl1r.v v6, (a1) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: mv a1, a0
+; CHECK64-NEXT: slli a4, s0, 3
+; CHECK64-NEXT: srli s4, s0, 2
+; CHECK64-NEXT: srli a0, s0, 3
+; CHECK64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslidedown.vx v7, v6, s4
+; CHECK64-NEXT: add a2, s3, a4
+; CHECK64-NEXT: vl8re64.v v16, (a2)
+; CHECK64-NEXT: slli a6, s0, 4
+; CHECK64-NEXT: slli a2, s0, 1
+; CHECK64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK64-NEXT: vslidedown.vx v0, v6, a0
+; CHECK64-NEXT: mv a3, s1
+; CHECK64-NEXT: bltu s1, a2, .LBB257_2
+; CHECK64-NEXT: # %bb.1:
+; CHECK64-NEXT: mv a3, a2
+; CHECK64-NEXT: .LBB257_2:
+; CHECK64-NEXT: add a5, s3, a1
+; CHECK64-NEXT: add a1, s2, a4
+; CHECK64-NEXT: vslidedown.vx v9, v7, a0
+; CHECK64-NEXT: csrr a4, vlenb
+; CHECK64-NEXT: slli a7, a4, 4
+; CHECK64-NEXT: add a4, a7, a4
+; CHECK64-NEXT: add a4, sp, a4
+; CHECK64-NEXT: addi a4, a4, 16
+; CHECK64-NEXT: vs1r.v v9, (a4) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: add a4, s3, a6
+; CHECK64-NEXT: vl8re64.v v24, (s3)
+; CHECK64-NEXT: sub a6, a3, s0
+; CHECK64-NEXT: sltu a7, a3, a6
+; CHECK64-NEXT: addi a7, a7, -1
+; CHECK64-NEXT: and a6, a7, a6
+; CHECK64-NEXT: csrr a7, vlenb
+; CHECK64-NEXT: slli t0, a7, 3
+; CHECK64-NEXT: add a7, t0, a7
+; CHECK64-NEXT: add a7, sp, a7
+; CHECK64-NEXT: addi a7, a7, 16
+; CHECK64-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK64-NEXT: bltu a3, s0, .LBB257_4
+; CHECK64-NEXT: # %bb.3:
+; CHECK64-NEXT: mv a3, s0
+; CHECK64-NEXT: .LBB257_4:
+; CHECK64-NEXT: vmv1r.v v0, v6
+; CHECK64-NEXT: vl8re64.v v8, (a5)
+; CHECK64-NEXT: csrr a5, vlenb
+; CHECK64-NEXT: slli a6, a5, 3
+; CHECK64-NEXT: add a5, a6, a5
+; CHECK64-NEXT: add a5, sp, a5
+; CHECK64-NEXT: addi a5, a5, 16
+; CHECK64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr a5, vlenb
+; CHECK64-NEXT: slli a5, a5, 1
+; CHECK64-NEXT: mv a6, a5
+; CHECK64-NEXT: slli a5, a5, 3
+; CHECK64-NEXT: add a5, a5, a6
+; CHECK64-NEXT: add a5, sp, a5
+; CHECK64-NEXT: addi a5, a5, 16
+; CHECK64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK64-NEXT: vl8re64.v v16, (a1)
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: vl8re64.v v16, (a4)
+; CHECK64-NEXT: sub a1, s1, a2
+; CHECK64-NEXT: sltu a2, s1, a1
+; CHECK64-NEXT: vl8re64.v v24, (s2)
+; CHECK64-NEXT: addi a2, a2, -1
+; CHECK64-NEXT: and s1, a2, a1
+; CHECK64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; CHECK64-NEXT: vslideup.vx v8, v5, a0
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: mv a2, a1
+; CHECK64-NEXT: slli a1, a1, 3
+; CHECK64-NEXT: add a1, a1, a2
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: mv a1, s1
+; CHECK64-NEXT: bltu s1, s0, .LBB257_6
+; CHECK64-NEXT: # %bb.5:
+; CHECK64-NEXT: mv a1, s0
+; CHECK64-NEXT: .LBB257_6:
+; CHECK64-NEXT: vmv1r.v v0, v7
+; CHECK64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v8, v24, v16, v0.t
+; CHECK64-NEXT: addi a1, sp, 16
+; CHECK64-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: li a1, 3
+; CHECK64-NEXT: call __muldi3
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: slli a2, a1, 4
+; CHECK64-NEXT: add a1, a2, a1
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: mv a2, a1
+; CHECK64-NEXT: slli a1, a1, 3
+; CHECK64-NEXT: add a1, a1, a2
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: addi a1, sp, 16
+; CHECK64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK64-NEXT: vslideup.vx v9, v8, s4
+; CHECK64-NEXT: sub a1, s1, s0
+; CHECK64-NEXT: sltu a2, s1, a1
+; CHECK64-NEXT: addi a2, a2, -1
+; CHECK64-NEXT: and a1, a2, a1
+; CHECK64-NEXT: csrr a2, vlenb
+; CHECK64-NEXT: slli a3, a2, 3
+; CHECK64-NEXT: add a2, a3, a2
+; CHECK64-NEXT: add a2, sp, a2
+; CHECK64-NEXT: addi a2, a2, 16
+; CHECK64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: csrr a2, vlenb
+; CHECK64-NEXT: add a2, sp, a2
+; CHECK64-NEXT: addi a2, a2, 16
+; CHECK64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v8, v24, v16, v0.t
+; CHECK64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslideup.vx v9, v8, a0
+; CHECK64-NEXT: vmv1r.v v0, v9
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv a1, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a1, a1, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, a0, a1
+; CHECK64-NEXT: add sp, sp, a0
+; CHECK64-NEXT: .cfi_def_cfa sp, 64
+; CHECK64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; CHECK64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; CHECK64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; CHECK64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
+; CHECK64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
+; CHECK64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
+; CHECK64-NEXT: .cfi_restore ra
+; CHECK64-NEXT: .cfi_restore s0
+; CHECK64-NEXT: .cfi_restore s1
+; CHECK64-NEXT: .cfi_restore s2
+; CHECK64-NEXT: .cfi_restore s3
+; CHECK64-NEXT: .cfi_restore s4
+; CHECK64-NEXT: addi sp, sp, 64
+; CHECK64-NEXT: .cfi_def_cfa_offset 0
+; CHECK64-NEXT: ret
%v = call <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, metadata !"oeq", <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x i1> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index c216fb6..346e40a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -549,7 +549,7 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: .LBB10_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: subw a3, a1, a3
+; CHECK-NEXT: sub a3, a1, a3
; CHECK-NEXT: sw a3, 0(a2)
; CHECK-NEXT: addi a2, a2, 4
; CHECK-NEXT: bne a2, a0, .LBB10_6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index 66e114c..f295bd8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2300,7 +2300,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
; CHECK-RV64-NEXT: j .LBB98_5
; CHECK-RV64-NEXT: .LBB98_2: # %vector.ph
; CHECK-RV64-NEXT: srli a3, a4, 1
-; CHECK-RV64-NEXT: negw a2, a3
+; CHECK-RV64-NEXT: neg a2, a3
; CHECK-RV64-NEXT: andi a2, a2, 256
; CHECK-RV64-NEXT: slli a4, a4, 1
; CHECK-RV64-NEXT: mv a5, a0
@@ -2393,7 +2393,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
; CHECK-ZVKB-NOZBB64-NEXT: j .LBB98_5
; CHECK-ZVKB-NOZBB64-NEXT: .LBB98_2: # %vector.ph
; CHECK-ZVKB-NOZBB64-NEXT: srli a3, a4, 1
-; CHECK-ZVKB-NOZBB64-NEXT: negw a2, a3
+; CHECK-ZVKB-NOZBB64-NEXT: neg a2, a3
; CHECK-ZVKB-NOZBB64-NEXT: andi a2, a2, 256
; CHECK-ZVKB-NOZBB64-NEXT: slli a4, a4, 1
; CHECK-ZVKB-NOZBB64-NEXT: mv a5, a0
@@ -2485,7 +2485,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
; CHECK-ZVKB-ZBB64-NEXT: j .LBB98_5
; CHECK-ZVKB-ZBB64-NEXT: .LBB98_2: # %vector.ph
; CHECK-ZVKB-ZBB64-NEXT: srli a3, a4, 1
-; CHECK-ZVKB-ZBB64-NEXT: negw a2, a3
+; CHECK-ZVKB-ZBB64-NEXT: neg a2, a3
; CHECK-ZVKB-ZBB64-NEXT: andi a2, a2, 256
; CHECK-ZVKB-ZBB64-NEXT: slli a4, a4, 1
; CHECK-ZVKB-ZBB64-NEXT: mv a5, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index 3740737..d0b184b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -50,9 +50,9 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV64-NEXT: sgtz a5, a5
; RV64-NEXT: sgtz a4, a4
; RV64-NEXT: sgtz a3, a3
-; RV64-NEXT: negw a3, a3
-; RV64-NEXT: negw a4, a4
-; RV64-NEXT: negw a5, a5
+; RV64-NEXT: neg a3, a3
+; RV64-NEXT: neg a4, a4
+; RV64-NEXT: neg a5, a5
; RV64-NEXT: and a3, a3, a6
; RV64-NEXT: and a0, a4, a0
; RV64-NEXT: and a2, a5, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 578b67e..f9f0aa6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -542,95 +542,30 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
; CHECK-LABEL: masked_load_factor2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl4r.v v12, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
%deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %deinterleaved.results
}
-define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) {
-; CHECK-LABEL: masked_loat_factor4:
+define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_load_factor4(ptr %p) {
+; CHECK-LABEL: masked_load_factor4:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: vl4r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
%deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results
}
-define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: masked_loat_factor4_mask:
+define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_load_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: masked_load_factor4_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: add a3, a1, a2
-; CHECK-NEXT: vmv.v.v v9, v8
-; CHECK-NEXT: srli a4, a2, 2
-; CHECK-NEXT: vmv.v.v v10, v8
-; CHECK-NEXT: srli a5, a2, 3
-; CHECK-NEXT: vmv.v.v v11, v8
-; CHECK-NEXT: vsseg4e8.v v8, (a1)
-; CHECK-NEXT: vl1r.v v8, (a1)
-; CHECK-NEXT: add a1, a4, a5
-; CHECK-NEXT: vl1r.v v9, (a3)
-; CHECK-NEXT: add a3, a3, a2
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: vl1r.v v10, (a3)
-; CHECK-NEXT: vl1r.v v11, (a2)
-; CHECK-NEXT: vmsne.vi v9, v9, 0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vmsne.vi v8, v10, 0
-; CHECK-NEXT: vmsne.vi v10, v11, 0
-; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vx v0, v9, a5
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vx v0, v8, a4
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v0, v10, a1
-; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0), v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
%interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
@@ -640,8 +575,8 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i
; Negative test - some of the deinterleaved elements might come from the
; passthru not the load
-define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_passthru(ptr %p, <vscale x 8 x i1> %mask, <vscale x 32 x i8> %passthru) {
-; CHECK-LABEL: masked_loat_factor4_passthru:
+define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_load_factor4_passthru(ptr %p, <vscale x 8 x i1> %mask, <vscale x 32 x i8> %passthru) {
+; CHECK-LABEL: masked_load_factor4_passthru:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index af55aaa..7e7d11e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -303,3 +303,26 @@ define void @vector_interleave_store_factor8(<vscale x 2 x i32> %a, <vscale x 2
store <vscale x 16 x i32> %v, ptr %p
ret void
}
+
+define void @masked_store_factor3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p) {
+; CHECK-LABEL: masked_store_factor3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c)
+ call void @llvm.masked.store(<vscale x 6 x i32> %v, ptr %p, i32 4, <vscale x 6 x i1> splat (i1 true))
+ ret void
+}
+
+define void @masked_store_factor3_masked(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: masked_store_factor3_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %interleaved.mask = call <vscale x 6 x i1> @llvm.vector.interleave3(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
+ %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c)
+ call void @llvm.masked.store(<vscale x 6 x i32> %v, ptr %p, i32 4, <vscale x 6 x i1> %interleaved.mask)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index 4883a4d..dbe0ecc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -1,3159 +1,1907 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb,+zvfbfwma -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb,+zvfbfwma -verify-machineinstrs | FileCheck %s
; The purpose of this file is to check the behavior of specific instructions as it relates to the VL optimizer
define <vscale x 4 x i32> @vadd_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vadd_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vadd_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vadd_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vadd_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vadd_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vadd_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vadd_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vadd_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsub_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsub.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsub.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsub.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsub.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsub_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsub_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsub.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsub_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsub.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsub_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsub.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsub.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrsub_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vrsub_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrsub.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrsub_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vrsub.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrsub_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vrsub.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrsub.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrsub_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrsub_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrsub.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrsub_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vrsub.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrsub_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vrsub.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrsub.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vand_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vand_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vand.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vand_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vand.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vand_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vand.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vand.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vand_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vand_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vand.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vand_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vand.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vand_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vand.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vand.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vand_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vand_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vand.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vand_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vand.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vand_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vand.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vand.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vor_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vor_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vor.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vor_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vor.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vor_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vor.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vor.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vor_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vor_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vor.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vor_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vor.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vor_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vor.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vor.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vor_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vor_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vor.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vor_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vor.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vor_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vor.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vor.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vxor_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vxor_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vxor.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vxor_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vxor.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vxor_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vxor.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vxor.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vxor_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vxor_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vxor.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vxor_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vxor.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vxor_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vxor.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vxor.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vxor_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vxor_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vxor.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vxor_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vxor.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vxor_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vxor.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vxor.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsll_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vsll_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsll.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsll_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsll.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsll_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsll.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsll.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsll_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsll_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsll.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsll_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsll.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsll_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsll.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsll.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsll_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsll_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsll.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsll_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsll.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsll_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsll.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsll.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i64> @vwaddu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwaddu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwaddu.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwaddu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwaddu.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwaddu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwaddu.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwaddu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i32> @vsrl_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vsrl_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsrl.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsrl_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsrl.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsrl_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsrl.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsrl.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsrl_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsrl_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsrl.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsrl_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsrl.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsrl_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsrl.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsrl.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsrl_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsrl_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsrl.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsrl_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsrl.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsrl_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsrl.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsrl.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsra_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vsra_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsra.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsra_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsra.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsra_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsra.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsra.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsra_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsra_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsra.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsra_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsra.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsra_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsra.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsra.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsra_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsra_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsra.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsra_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsra.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsra_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsra.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsra.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i64> @vwaddu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwaddu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwaddu.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwaddu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwaddu.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwaddu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwaddu.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsubu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsubu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsubu.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsubu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwsubu.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsubu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwsubu.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsubu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsubu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsubu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsubu.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsubu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwsubu.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsubu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwsubu.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsubu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwadd_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwadd.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwadd.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwadd.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwadd.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwadd_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwadd_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwadd.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwadd_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwadd.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwadd_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwadd.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsub_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsub.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwsub.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwsub.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsub.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsub_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsub_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsub.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsub_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwsub.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsub_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwsub.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsub.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwaddu_wv(<vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwaddu_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwaddu.wv v8, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwaddu_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwaddu.wv v8, v8, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwaddu_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwaddu.wv v8, v8, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwaddu.w.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwaddu_wx(<vscale x 4 x i64> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwaddu_wx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwaddu.wx v8, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwaddu_wx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwaddu.wx v8, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwaddu_wx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwaddu.wx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i64> @llvm.riscv.vwaddu.w.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsubu_wv(<vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsubu_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsubu.wv v8, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsubu_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwsubu.wv v8, v8, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsubu_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwsubu.wv v8, v8, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsubu.w.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsubu_wx(<vscale x 4 x i64> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsubu_wx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsubu.wx v8, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsubu_wx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwsubu.wx v8, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsubu_wx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwsubu.wx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsubu.w.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwadd_wv(<vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwadd_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwadd.wv v8, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwadd_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwadd.wv v8, v8, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwadd_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwadd.wv v8, v8, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwadd.w.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwadd_wx(<vscale x 4 x i64> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwadd_wx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwadd.wx v8, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwadd_wx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwadd.wx v8, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwadd_wx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwadd.wx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwadd.w.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsub_wv(<vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsub_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsub.wv v8, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsub_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwsub.wv v8, v8, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsub_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwsub.wv v8, v8, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsub.w.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwsub_wx(<vscale x 4 x i64> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsub_wx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwsub.wx v8, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsub_wx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwsub.wx v8, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsub_wx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwsub.wx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwsub.w.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i64> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i32> @vsext_vf2(<vscale x 4 x i16> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsext_vf2:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsext.vf2 v12, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsext_vf2:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsext.vf2 v12, v8
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsext_vf2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsext.nxv4i32.nxv4i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsext_vf4(<vscale x 4 x i8> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsext_vf4:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsext.vf4 v12, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsext_vf4:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsext.vf4 v12, v8
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsext_vf4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsext.vf4 v12, v8
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsext.nxv4i32.nxv4i8(<vscale x 4 x i32> poison, <vscale x 4 x i8> %a, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i64> @vsext_vf8(<vscale x 4 x i8> %a, <vscale x 4 x i64> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsext_vf8:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; NOVLOPT-NEXT: vsext.vf8 v16, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v16, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsext_vf8:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; VLOPT-NEXT: vsext.vf8 v16, v8
-; VLOPT-NEXT: vadd.vv v8, v16, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsext_vf8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vadd.vv v8, v16, v12
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i64> @llvm.riscv.vsext.nxv4i64.nxv4i8(<vscale x 4 x i64> poison, <vscale x 4 x i8> %a, iXLen -1)
  %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %b, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i32> @vzext_vf2(<vscale x 4 x i16> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vzext_vf2:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vzext.vf2 v12, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vzext_vf2:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vzext.vf2 v12, v8
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vzext_vf2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v12, v8
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vzext.nxv4i32.nxv4i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vzext_vf4(<vscale x 4 x i8> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vzext_vf4:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vzext.vf4 v12, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vzext_vf4:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vzext.vf4 v12, v8
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vzext_vf4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf4 v12, v8
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vzext.nxv4i32.nxv4i8(<vscale x 4 x i32> poison, <vscale x 4 x i8> %a, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i64> @vzext_vf8(<vscale x 4 x i8> %a, <vscale x 4 x i64> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vzext_vf8:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; NOVLOPT-NEXT: vzext.vf8 v16, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v16, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vzext_vf8:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; VLOPT-NEXT: vzext.vf8 v16, v8
-; VLOPT-NEXT: vadd.vv v8, v16, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vzext_vf8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vzext.vf8 v16, v8
+; CHECK-NEXT: vadd.vv v8, v16, v12
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i64> @llvm.riscv.vzext.nxv4i64.nxv4i8(<vscale x 4 x i64> poison, <vscale x 4 x i8> %a, iXLen -1)
  %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %b, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i1> @vmadc_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmadc_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadc.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadc_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmadc.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadc_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmadc.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmadc.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmadc_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmadc_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadc.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadc_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmadc.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadc_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmadc.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmadc.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmadc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmadc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadc.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmadc.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmadc.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmadc.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmadc_vim(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmadc_vim:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadc.vim v11, v8, 5, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadc_vim:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmadc.vim v11, v8, 5, v0
-; VLOPT-NEXT: vmand.mm v0, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadc_vim:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmadc.vim v11, v8, 5, v0
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmadc.carry.in.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmadc_vxm(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmadc_vxm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadc.vxm v11, v8, a0, v0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadc_vxm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmadc.vxm v11, v8, a0, v0
-; VLOPT-NEXT: vmand.mm v0, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadc_vxm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmadc.vxm v11, v8, a0, v0
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmadc.carry.in.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmadc_vvm(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmadc_vvm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadc.vvm v11, v8, v12, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadc_vvm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmadc.vvm v11, v8, v12, v0
-; VLOPT-NEXT: vmand.mm v0, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadc_vvm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmadc.vvm v11, v8, v12, v0
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmadc.carry.in.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsbc_vvm(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsbc_vvm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsbc.vvm v11, v8, v12, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsbc_vvm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsbc.vvm v11, v8, v12, v0
-; VLOPT-NEXT: vmand.mm v0, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsbc_vvm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsbc.vvm v11, v8, v12, v0
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsbc.borrow.in.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsbc_vxm(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsbc_vxm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsbc.vxm v11, v8, a0, v0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsbc_vxm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsbc.vxm v11, v8, a0, v0
-; VLOPT-NEXT: vmand.mm v0, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsbc_vxm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsbc.vxm v11, v8, a0, v0
+; CHECK-NEXT: vmand.mm v0, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsbc.borrow.in.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsbc_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsbc_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsbc.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsbc_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsbc.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsbc_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsbc.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsbc.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsbc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsbc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsbc.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsbc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsbc.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsbc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsbc.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsbc.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i16> @vnsrl_wi(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnsrl_wi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vnsrl.wi v11, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnsrl_wi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vnsrl.wi v11, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnsrl_wi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vnsrl.wi v11, v8, 5
+; CHECK-NEXT: vadd.vv v8, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i16> @llvm.riscv.vnsrl.nxv4i16.nxv4i32(<vscale x 4 x i16> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen -1)
%2 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %1, <vscale x 4 x i16> %b, iXLen %vl)
ret <vscale x 4 x i16> %2
}
define <vscale x 4 x i16> @vnsrl_wx(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, iXLen %c, iXLen %vl) {
-; NOVLOPT-LABEL: vnsrl_wx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vnsrl.wx v11, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnsrl_wx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vnsrl.wx v11, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnsrl_wx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vnsrl.wx v11, v8, a0
+; CHECK-NEXT: vadd.vv v8, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i16> @llvm.riscv.vnsrl.nxv4i16.nxv4i32(<vscale x 4 x i16> poison, <vscale x 4 x i32> %a, iXLen %c, iXLen -1)
%2 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %1, <vscale x 4 x i16> %b, iXLen %vl)
ret <vscale x 4 x i16> %2
}
define <vscale x 4 x i16> @vnsrl_wv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vnsrl_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vnsrl.wv v12, v8, v11
-; NOVLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnsrl_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vnsrl.wv v12, v8, v11
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnsrl_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vnsrl.wv v12, v8, v11
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i16> @llvm.riscv.vnsrl.nxv4i16.nxv4i32.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i32> %a, <vscale x 4 x i16> %c, iXLen -1)
%2 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %1, <vscale x 4 x i16> %b, iXLen %vl)
ret <vscale x 4 x i16> %2
}
define <vscale x 4 x i16> @vnsra_wi(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnsra_wi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vnsra.wi v11, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnsra_wi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vnsra.wi v11, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnsra_wi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vnsra.wi v11, v8, 5
+; CHECK-NEXT: vadd.vv v8, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i16> @llvm.riscv.vnsra.nxv4i16.nxv4i32(<vscale x 4 x i16> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen -1)
%2 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %1, <vscale x 4 x i16> %b, iXLen %vl)
ret <vscale x 4 x i16> %2
}
define <vscale x 4 x i16> @vnsra_wx(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, iXLen %c, iXLen %vl) {
-; NOVLOPT-LABEL: vnsra_wx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vnsra.wx v11, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v11, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnsra_wx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vnsra.wx v11, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v11, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnsra_wx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vnsra.wx v11, v8, a0
+; CHECK-NEXT: vadd.vv v8, v11, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i16> @llvm.riscv.vnsra.nxv4i16.nxv4i32(<vscale x 4 x i16> poison, <vscale x 4 x i32> %a, iXLen %c, iXLen -1)
%2 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %1, <vscale x 4 x i16> %b, iXLen %vl)
ret <vscale x 4 x i16> %2
}
define <vscale x 4 x i16> @vnsra_wv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vnsra_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vnsra.wv v12, v8, v11
-; NOVLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnsra_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vnsra.wv v12, v8, v11
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnsra_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vnsra.wv v12, v8, v11
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i16> @llvm.riscv.vnsra.nxv4i16.nxv4i32.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i32> %a, <vscale x 4 x i16> %c, iXLen -1)
%2 = call <vscale x 4 x i16> @llvm.riscv.vadd.nxv4i16.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i16> %1, <vscale x 4 x i16> %b, iXLen %vl)
ret <vscale x 4 x i16> %2
}
define <vscale x 4 x i1> @vmseq_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmseq_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmseq.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmseq_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmseq.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmseq_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmseq.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmseq_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmseq_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmseq.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmseq_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmseq.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmseq_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmseq.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmseq_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmseq_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmseq.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmseq_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmseq.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmseq_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmseq.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsne_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmsne_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsne.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsne_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsne.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsne_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsne_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsne_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsne.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsne_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsne.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsne_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsne.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsne_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsne_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsne.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsne_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsne.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsne_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsne.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsltu_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsltu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsltu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsltu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsltu.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsltu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsltu.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsltu.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsltu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsltu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsltu.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsltu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsltu.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsltu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsltu.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsltu.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmslt_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmslt_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmslt.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmslt_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmslt.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmslt_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmslt.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmslt.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmslt_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmslt_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmslt.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmslt_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmslt.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmslt_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmslt.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmslt.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsleu_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmsleu_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsleu.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsleu_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsleu.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsleu_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsleu.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsleu_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsleu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsleu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsleu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsleu.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsleu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsleu.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsleu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsleu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsleu.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsleu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsleu.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsleu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsleu.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsle_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmsle_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsle.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsle_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsle.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsle_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsle.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsle_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsle_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsle.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsle_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsle.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsle_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsle.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsle_vv(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsle_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsle.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsle_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsle.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsle_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsle.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsgtu_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmsgtu_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsgtu.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsgtu_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsgtu.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsgtu_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsgtu.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsgtu.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsgtu_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsgtu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsgtu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsgtu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsgtu.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsgtu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsgtu.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsgtu.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsgt_vi(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmsgt_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsgt.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsgt_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmsgt.vi v10, v8, 5
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsgt_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmsgt.vi v10, v8, 5
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsgt.nxv4i32.i32(<vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmsgt_vx(<vscale x 4 x i32> %a, <vscale x 4 x i1> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsgt_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmsgt.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsgt_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmsgt.vx v10, v8, a0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsgt_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmsgt.vx v10, v8, a0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmsgt.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i32> @vminu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vminu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vminu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vminu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vminu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vminu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vminu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vminu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vminu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vminu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vminu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vminu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vminu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vminu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vminu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vminu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmin_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmin_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmin.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmin_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmin.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmin_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmin.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmin.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmin_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmin_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmin.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmin_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmin.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmin_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmin.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmin.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmaxu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmaxu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmaxu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmaxu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmaxu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmaxu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmaxu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmaxu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmaxu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmaxu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmaxu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmaxu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmaxu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmaxu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmaxu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmaxu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmax_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmax_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmax.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmax_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmax.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmax_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmax.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmax.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmax_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmax_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmax.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmax_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmax.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmax_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmax.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmax.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmul_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmul_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmul.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmul_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmul.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmul_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmul.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmul.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmul_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmul_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmul.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmul_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmul.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmul_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmul.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmul.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmulh_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmulh_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmulh.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmulh_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmulh.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmulh_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmulh.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmulh.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmulh_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmulh_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmulh.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmulh_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmulh.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmulh_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmulh.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmulh.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmulhu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmulhu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmulhu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmulhu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmulhu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmulhu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmulhu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmulhu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmulhu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmulhu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmulhu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmulhu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmulhu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmulhu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmulhsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmulhsu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmulhsu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmulhsu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmulhsu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmulhsu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmulhsu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmulhsu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmulhsu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmulhsu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmulhsu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmulhsu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmulhsu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmulhsu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmulhsu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmulhsu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vdivu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vdivu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vdivu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vdivu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vdivu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vdivu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vdivu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vdivu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vdivu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vdivu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vdivu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vdivu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vdivu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vdivu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vdivu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vdivu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vdiv_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vdiv_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vdiv.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vdiv_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vdiv.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vdiv_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vdiv.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vdiv.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vdiv_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vdiv_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vdiv.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vdiv_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vdiv.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vdiv_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vdiv.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vdiv.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vremu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vremu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vremu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vremu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vremu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vremu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vremu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vremu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vremu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vremu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vremu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vremu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vremu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vremu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vremu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vremu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrem_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrem_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrem.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrem_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vrem.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrem_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vrem.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrem.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrem_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrem_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrem.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrem_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vrem.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrem_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vrem.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrem.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i64> @vwmul_vv(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwmul_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmul.vv v12, v8, v9
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwmul.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmul_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vwmul.vv v12, v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vwmul.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmul_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vwmul.vv v12, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vwmul.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwmul_vx(<vscale x 4 x i16> %a, i16 %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vwmul_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmul.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwmul.vx v8, v12, a1
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmul_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a2, e16, m1, ta, ma
-; VLOPT-NEXT: vwmul.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vwmul.vx v8, v12, a1
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmul_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vwmul.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vwmul.vx v8, v12, a1
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmul.nxv4i32.nxv4i16.i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, i16 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vwmul.nxv4i64.nxv4i64.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %1, i32 %c, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwmulsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwmulsu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwmulsu.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmulsu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwmulsu.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmulsu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwmulsu.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwmulsu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwmulsu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwmulsu.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmulsu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwmulsu.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmulsu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwmulsu.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwmulu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwmulu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwmulu.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmulu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vwmulu.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmulu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vwmulu.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.nxv4i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i64> @vwmulu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwmulu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vwmulu.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmulu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vwmulu.vx v12, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmulu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl)
ret <vscale x 4 x i64> %2
}
define <vscale x 4 x i32> @vwmacc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vwmacc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmacc.vv v8, v10, v11
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmacc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
-; VLOPT-NEXT: vwmacc.vv v8, v10, v11
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmacc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vwmacc.vv v8, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmacc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmacc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmacc.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmacc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vmacc.vv v8, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmacc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vmacc.vv v8, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmacc.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmacc_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmacc_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmv2r.v v10, v8
-; NOVLOPT-NEXT: vmacc.vx v10, a0, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmacc_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, tu, ma
-; VLOPT-NEXT: vmv2r.v v10, v8
-; VLOPT-NEXT: vmacc.vx v10, a0, v8
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmacc_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma
+; CHECK-NEXT: vmv2r.v v10, v8
+; CHECK-NEXT: vmacc.vx v10, a0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmacc.nxv4i32.i32(<vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i32> %a, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmadd_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmadd.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vmadd.vv v8, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vmadd.vv v8, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmadd_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vmadd_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmv2r.v v10, v8
-; NOVLOPT-NEXT: vmadd.vx v10, a0, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmadd_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, tu, ma
-; VLOPT-NEXT: vmv2r.v v10, v8
-; VLOPT-NEXT: vmadd.vx v10, a0, v8
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmadd_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma
+; CHECK-NEXT: vmv2r.v v10, v8
+; CHECK-NEXT: vmadd.vx v10, a0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i32> %a, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnmsac_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnmsac_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vnmsac.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnmsac_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vnmsac.vv v8, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnmsac_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vnmsac.vv v8, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnmsac.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnmsac_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnmsac_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmv2r.v v10, v8
-; NOVLOPT-NEXT: vnmsac.vx v10, a0, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnmsac_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, tu, ma
-; VLOPT-NEXT: vmv2r.v v10, v8
-; VLOPT-NEXT: vnmsac.vx v10, a0, v8
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnmsac_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma
+; CHECK-NEXT: vmv2r.v v10, v8
+; CHECK-NEXT: vnmsac.vx v10, a0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnmsac.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i32> %a, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnmsub_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnmsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vnmsub.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnmsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vnmsub.vv v8, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnmsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vnmsub.vv v8, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnmsub.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnmsub_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnmsub_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmv2r.v v10, v8
-; NOVLOPT-NEXT: vnmsub.vx v10, a0, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnmsub_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, tu, ma
-; VLOPT-NEXT: vmv2r.v v10, v8
-; VLOPT-NEXT: vnmsub.vx v10, a0, v8
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnmsub_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma
+; CHECK-NEXT: vmv2r.v v10, v8
+; CHECK-NEXT: vnmsub.vx v10, a0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnmsub.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i32> %a, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vwmacc_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmacc.vx v8, a0, v10
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmacc_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; VLOPT-NEXT: vwmacc.vx v8, a0, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmacc_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vwmacc.vx v8, a0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwmaccu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vwmaccu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmaccu.vv v8, v10, v11
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmaccu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
-; VLOPT-NEXT: vwmaccu.vv v8, v10, v11
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmaccu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vwmaccu.vv v8, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, i32 %e, iXLen %vl) {
-; NOVLOPT-LABEL: vwmaccu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmaccu.vx v8, a0, v10
-; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmaccu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a2, e16, m1, tu, ma
-; VLOPT-NEXT: vwmaccu.vx v8, a0, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmaccu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, tu, ma
+; CHECK-NEXT: vwmaccu.vx v8, a0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwmaccsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vwmaccsu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmaccsu.vv v8, v10, v11
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmaccsu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
-; VLOPT-NEXT: vwmaccsu.vv v8, v10, v11
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmaccsu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vwmaccsu.vv v8, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwmaccsu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vwmaccsu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmaccsu.vx v8, a0, v10
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmaccsu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; VLOPT-NEXT: vwmaccsu.vx v8, a0, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmaccsu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vwmaccsu.vx v8, a0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwmaccus_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vwmaccus_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vwmaccus.vx v8, a0, v10
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwmaccus_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; VLOPT-NEXT: vwmaccus.vx v8, a0, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwmaccus_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vwmaccus.vx v8, a0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccus.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsaddu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsaddu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsaddu.vv v10, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsaddu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsaddu.vv v10, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsaddu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v10, v8, v10
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsaddu(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsaddu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsaddu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsaddu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsaddu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsaddu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsaddu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsaddu(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsaddu_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vsaddu_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsaddu.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsaddu_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsaddu.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsaddu_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsaddu(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsadd_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsadd.vv v10, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsadd.vv v10, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v10, v8, v10
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsadd_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsadd_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsadd.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsadd_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsadd.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsadd_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsadd_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vsadd_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsadd.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsadd_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsadd.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsadd_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssubu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssubu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssubu.vv v10, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssubu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vssubu.vv v10, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssubu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v10, v8, v10
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssubu(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssubu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssubu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssubu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssubu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vssubu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssubu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssubu(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssub_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssub.vv v10, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vssub.vv v10, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vv v10, v8, v10
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssub(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssub_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssub_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssub.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssub_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vssub.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssub_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssub(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsmul_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsmul_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsmul.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsmul_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsmul.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsmul_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsmul.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsmul_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vsmul_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsmul.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsmul_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsmul.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsmul_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsmul.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssrl_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssrl_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssrl.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssrl_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vssrl.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssrl_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssrl.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssrl_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssrl_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssrl.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssrl_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vssrl.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssrl_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssrl.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssrl_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vssrl_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssrl.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssrl_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vssrl.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssrl_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssrl.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssra_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssra_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssra.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssra_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vssra.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssra_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssra.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssra_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vssra_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssra.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssra_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vssra.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssra_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssra.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vssra_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vssra_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vssra.vi v10, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vssra_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vssra.vi v10, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vssra_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssra.vi v10, v8, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnclipu_vv(<vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnclipu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vnclipu.wv v14, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v14, v14
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclipu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vnclipu.wv v14, v8, v12
-; VLOPT-NEXT: vadd.vv v8, v14, v14
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclipu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vnclipu.wv v14, v8, v12
+; CHECK-NEXT: vadd.vv v8, v14, v14
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnclipu(<vscale x 4 x i32> poison, <vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnclipu_vx(<vscale x 4 x i64> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnclipu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vnclipu.wx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclipu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vnclipu.wx v12, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclipu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vnclipu.wx v12, v8, a0
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnclipu(<vscale x 4 x i32> poison, <vscale x 4 x i64> %a, iXLen %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnclipu_vi(<vscale x 4 x i64> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vnclipu_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vnclipu.wi v12, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclipu_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vnclipu.wi v12, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclipu_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vnclipu.wi v12, v8, 5
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnclipu(<vscale x 4 x i32> poison, <vscale x 4 x i64> %a, iXLen 5, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnclip_vv(<vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnclip_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vnclip.wv v14, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v14, v14
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclip_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vnclip.wv v14, v8, v12
-; VLOPT-NEXT: vadd.vv v8, v14, v14
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclip_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vnclip.wv v14, v8, v12
+; CHECK-NEXT: vadd.vv v8, v14, v14
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnclip(<vscale x 4 x i32> poison, <vscale x 4 x i64> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnclip_vx(<vscale x 4 x i64> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vnclip_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vnclip.wx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclip_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vnclip.wx v12, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclip_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vnclip.wx v12, v8, a0
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnclip(<vscale x 4 x i32> poison, <vscale x 4 x i64> %a, iXLen %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vnclip_vi(<vscale x 4 x i64> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vnclip_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vnclip.wi v12, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclip_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vnclip.wi v12, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclip_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vnclip.wi v12, v8, 5
+; CHECK-NEXT: vadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vnclip(<vscale x 4 x i32> poison, <vscale x 4 x i64> %a, iXLen 5, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmv_v_i(<vscale x 4 x i32> %a, i32 %x, iXLen %vl) {
-; NOVLOPT-LABEL: vmv_v_i:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmv.v.i v10, 5
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmv_v_i:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmv.v.i v10, 5
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmv_v_i:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(<vscale x 4 x i32> poison, i32 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmv_v_x(<vscale x 4 x i32> %a, i32 %x, iXLen %vl) {
-; NOVLOPT-LABEL: vmv_v_x:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmv.v.x v10, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmv_v_x:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmv.v.x v10, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmv_v_x:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(<vscale x 4 x i32> poison, i32 %x, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
@@ -3161,110 +1909,67 @@ define <vscale x 4 x i32> @vmv_v_x(<vscale x 4 x i32> %a, i32 %x, iXLen %vl) {
; The vmv.v.v would be optimized away if we used a vadd as the user, so a
; vmerge is used instead (see the sketch after this function).
define <vscale x 1 x i8> @vmv_v_v(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, iXLen %vl) {
-; NOVLOPT-LABEL: vmv_v_v:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, tu, ma
-; NOVLOPT-NEXT: vmv.v.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmerge.vvm v8, v8, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmv_v_v:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, tu, ma
-; VLOPT-NEXT: vmv.v.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; VLOPT-NEXT: vmerge.vvm v8, v8, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmv_v_v:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: ret
%2 = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, iXLen -1)
%3 = call <vscale x 1 x i8> @llvm.riscv.vmerge.nxv1i8.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> %2, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, iXLen %vl)
ret <vscale x 1 x i8> %3
}
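; A minimal sketch (not part of the original test) of the folding the comment
; above describes; %copy and %sum are illustrative names. If the copy's only
; user were a vadd at the same VL, e.g.
;   %copy = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, iXLen -1)
;   %sum = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(<vscale x 1 x i8> poison, <vscale x 1 x i8> %copy, <vscale x 1 x i8> %c, iXLen %vl)
; then the vmv.v.v would be optimized away and no copy would remain to test,
; which is why the function above uses vmerge as the user instead.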
define <vscale x 4 x i32> @vwsll_vi(<vscale x 4 x i16> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vwsll_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwsll.vi v10, v8, 1
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsll_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vwsll.vi v10, v8, 1
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsll_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vwsll.vi v10, v8, 1
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwsll.nxv4i32.nxv4i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, iXLen 1, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwsll_vx(<vscale x 4 x i16> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsll_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwsll.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsll_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwsll.vx v10, v8, a0
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsll_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vwsll.vx v10, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwsll.nxv4i32.nxv4i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, iXLen %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vwsll_vv(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vwsll_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwsll.vv v10, v8, v9
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vwsll_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; VLOPT-NEXT: vwsll.vv v10, v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vwsll_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vwsll.vv v10, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vwsll.nxv4i32.nxv4i16(<vscale x 4 x i32> poison, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 1 x i32> @vmand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmand_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmand_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmand.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmand_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmand.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3272,26 +1977,15 @@ define <vscale x 1 x i32> @vmand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmnand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmnand_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmnand.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmnand_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmnand.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmnand_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmnand.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmnand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3299,26 +1993,15 @@ define <vscale x 1 x i32> @vmnand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmandn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmandn_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmandn.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmandn_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmandn.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmandn_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmandn.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmandn.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3326,26 +2009,15 @@ define <vscale x 1 x i32> @vmandn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmxor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmxor_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmxor.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmxor_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmxor.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmxor_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmxor.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmxor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3353,26 +2025,15 @@ define <vscale x 1 x i32> @vmxor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmor_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmor.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmor_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmor.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmor_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmor.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3381,26 +2042,15 @@ define <vscale x 1 x i32> @vmor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <
define <vscale x 1 x i32> @vmnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmnor_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmnor.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmnor_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmnor.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmnor_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmnor.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmnor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3408,26 +2058,15 @@ define <vscale x 1 x i32> @vmnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmorn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmorn_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmorn.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmorn_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmorn.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmorn_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmorn.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmorn.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3435,26 +2074,15 @@ define <vscale x 1 x i32> @vmorn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmxnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmxnor_mm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmxnor.mm v8, v0, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v8
-; NOVLOPT-NEXT: vmv1r.v v8, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmxnor_mm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmxnor.mm v8, v0, v8
-; VLOPT-NEXT: vmand.mm v0, v0, v8
-; VLOPT-NEXT: vmv1r.v v8, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmxnor_mm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmxnor.mm v8, v0, v8
+; CHECK-NEXT: vmand.mm v0, v0, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v9, v9, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmxnor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3462,24 +2090,14 @@ define <vscale x 1 x i32> @vmxnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
}
define <vscale x 1 x i32> @vmsbf_m(<vscale x 1 x i1> %a, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsbf_m:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmsbf.m v9, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsbf_m:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmsbf.m v9, v0
-; VLOPT-NEXT: vmand.mm v0, v0, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v8, v8, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsbf_m:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmsbf.m v9, v0
+; CHECK-NEXT: vmand.mm v0, v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmsbf.nxv1i1(<vscale x 1 x i1> %a, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3487,24 +2105,14 @@ define <vscale x 1 x i32> @vmsbf_m(<vscale x 1 x i1> %a, <vscale x 1 x i32> %c,
}
define <vscale x 1 x i32> @vmsif_m(<vscale x 1 x i1> %a, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsif_m:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmsif.m v9, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsif_m:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmsif.m v9, v0
-; VLOPT-NEXT: vmand.mm v0, v0, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v8, v8, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsif_m:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmsif.m v9, v0
+; CHECK-NEXT: vmand.mm v0, v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmsif.nxv1i1(<vscale x 1 x i1> %a, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3512,24 +2120,14 @@ define <vscale x 1 x i32> @vmsif_m(<vscale x 1 x i1> %a, <vscale x 1 x i32> %c,
}
define <vscale x 1 x i32> @vmsof_m(<vscale x 1 x i1> %a, <vscale x 1 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmsof_m:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmsof.m v9, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v0, v9
-; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8, v0.t
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmsof_m:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; VLOPT-NEXT: vmsof.m v9, v0
-; VLOPT-NEXT: vmand.mm v0, v0, v9
-; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT: vadd.vv v8, v8, v8, v0.t
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmsof_m:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmsof.m v9, v0
+; CHECK-NEXT: vmand.mm v0, v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-NEXT: vadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT: ret
%1 = call <vscale x 1 x i1> @llvm.riscv.vmsof.nxv1i1(<vscale x 1 x i1> %a, iXLen -1)
%2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
%3 = call <vscale x 1 x i32> @llvm.riscv.vadd.mask.nxv1i32.nxv1i32(<vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i32> %c, <vscale x 1 x i1> %2, iXLen %vl, iXLen 0)
@@ -3537,160 +2135,96 @@ define <vscale x 1 x i32> @vmsof_m(<vscale x 1 x i1> %a, <vscale x 1 x i32> %c,
}
define <vscale x 4 x i32> @viota_m(<vscale x 4 x i1> %a, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: viota_m:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: viota.m v10, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: viota_m:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: viota.m v10, v0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: viota_m:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: viota.m v10, v0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.viota.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %a, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %c, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vid.v(<vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vid.v:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vid.v v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vid.v:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vid.v v10
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vid.v:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vid.nxv4i32(<vscale x 4 x i32> poison, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %c, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vslideup_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vslideup_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vslideup.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vslideup_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vslideup.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vslideup_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vslideup(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vslideup_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vslideup_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vslideup.vi v10, v8, 2
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vslideup_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vslideup.vi v10, v8, 2
-; VLOPT-NEXT: vadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vslideup_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vslideup(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 2, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vslidedown_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vslidedown_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vslidedown.vx v8, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vslidedown_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vslidedown.vx v8, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vslidedown_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vslidedown(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vslidedown_vi(<vscale x 4 x i32> %a, iXLen %vl) {
-; NOVLOPT-LABEL: vslidedown_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vslidedown.vi v8, v8, 2
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vslidedown_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vslidedown.vi v8, v8, 2
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vslidedown_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 2
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vslidedown(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 2, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vslide1up_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vslide1up_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vslide1up.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vslide1up_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vslide1up.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vslide1up_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vslide1up.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vslide1up(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x float> @vfslide1up_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfslide1up_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfslide1up.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfslide1up_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfslide1up.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfslide1up_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfslide1up.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfslide1up(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
@@ -3699,21 +2233,13 @@ define <vscale x 4 x float> @vfslide1up_vf(<vscale x 4 x float> %a, float %b, iX
; Negative test - not safe to reduce vl
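; vslide1down writes the scalar operand into lane vl-1, so shrinking the
; producer's vl from VLMAX down to %vl would change lane %vl-1, a lane the
; following vadd still reads.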
define <vscale x 4 x i32> @vslide1down_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
-; NOVLOPT-LABEL: vslide1down_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vslide1down.vx v8, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vslide1down_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vslide1down.vx v8, v8, a0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vslide1down_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vslide1down(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %b, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
@@ -3722,1911 +2248,1152 @@ define <vscale x 4 x i32> @vslide1down_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen
; Negative test - not safe to reduce vl
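; Same reasoning as above: vfslide1down writes the scalar into lane vl-1, so
; the producer must keep vl=VLMAX for the vfadd's lanes to be unchanged.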
define <vscale x 4 x float> @vfslide1down_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfslide1down_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfslide1down.vf v8, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfslide1down_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vfslide1down.vf v8, v8, fa0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfslide1down_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfslide1down(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfadd_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfadd_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfadd_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfadd_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfadd.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfadd_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsub_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsub.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsub.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfsub.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsub_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsub_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsub.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsub_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsub.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsub_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfsub.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfrsub_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfrsub_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfrsub.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfrsub_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfrsub.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfrsub_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfrsub.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfrsub.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x double> @vfwadd_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwadd.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwadd.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwadd.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.nxv4f32(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwadd_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwadd_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwadd.vf v12, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwadd_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwadd.vf v12, v8, fa0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwadd_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwadd.vf v12, v8, fa0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.f32(<vscale x 4 x double> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwsub_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwsub.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwsub.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwsub.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.nxv4f32(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwsub_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwsub_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwsub.vf v12, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwsub_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwsub.vf v12, v8, fa0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwsub_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwsub.vf v12, v8, fa0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.f32(<vscale x 4 x double> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwadd_wv(<vscale x 4 x double> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwadd_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwadd.wv v8, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwadd_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwadd.wv v8, v8, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwadd_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwadd.wv v8, v8, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.nxv4f32.nxv4f32(<vscale x 4 x double> poison, <vscale x 4 x double> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwadd_wf(<vscale x 4 x double> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwadd_wf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwadd.wf v8, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwadd_wf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwadd.wf v8, v8, fa0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwadd_wf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.nxv4f32.f32(<vscale x 4 x double> poison, <vscale x 4 x double> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwsub_wv(<vscale x 4 x double> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwsub_wv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwsub.wv v8, v8, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwsub_wv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwsub.wv v8, v8, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwsub_wv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwsub.wv v8, v8, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.nxv4f32.nxv4f32(<vscale x 4 x double> poison, <vscale x 4 x double> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwsub_wf(<vscale x 4 x double> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwsub_wf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwsub.wf v8, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwsub_wf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwsub.wf v8, v8, fa0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwsub_wf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.nxv4f32.f32(<vscale x 4 x double> poison, <vscale x 4 x double> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x float> @vfmul_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmul_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmul.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmul_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmul.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmul_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmul.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmul_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmul_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmul.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmul_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmul.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmul_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmul.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmul.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfdiv_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfdiv_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfdiv.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfdiv_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfdiv.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfdiv_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfdiv.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfdiv_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfdiv_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfdiv.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfdiv_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfdiv.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfdiv_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfdiv.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfrdiv_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfrdiv_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfrdiv.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfrdiv_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfrdiv.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfrdiv_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfrdiv.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfrdiv.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x double> @vfwmul_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmul_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwmul.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmul_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwmul.vv v12, v8, v10
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmul_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwmul.vv v12, v8, v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.nxv4f32(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwmul_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmul_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwmul.vf v12, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v12, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmul_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfwmul.vf v12, v8, fa0
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v12, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmul_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwmul.vf v12, v8, fa0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.f32(<vscale x 4 x double> poison, <vscale x 4 x float> %a, float %b, iXLen 7, iXLen -1)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %1, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x i1> @vmfeq_vf(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, float %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfeq_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfeq.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfeq_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfeq.vf v10, v8, fa0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfeq_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vf v10, v8, fa0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4f32.f32(<vscale x 4 x float> %a, float %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfeq_vv(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfeq_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfeq.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfeq_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfeq.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfeq_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4f32.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfne_vf(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, float %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfne_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfne.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfne_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfne.vf v10, v8, fa0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfne_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfne.vf v10, v8, fa0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4f32.f32(<vscale x 4 x float> %a, float %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfne_vv(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfne_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfne.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfne_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfne.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfne_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfne.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4f32.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmflt_vf(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, float %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmflt_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmflt.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmflt_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmflt.vf v10, v8, fa0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmflt_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmflt.vf v10, v8, fa0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4f32.f32(<vscale x 4 x float> %a, float %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmflt_vv(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmflt_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmflt.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmflt_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmflt.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmflt_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4f32.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfle_vf(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, float %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfle_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfle.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfle_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfle.vf v10, v8, fa0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfle_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfle.vf v10, v8, fa0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4f32.f32(<vscale x 4 x float> %a, float %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfle_vv(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfle_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfle.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfle_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfle.vv v12, v8, v10
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfle_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfle.vv v12, v8, v10
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4f32.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfgt_vf(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, float %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfgt_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmfgt.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v10, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfgt_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmfgt.vf v10, v8, fa0
-; VLOPT-NEXT: vmand.mm v0, v10, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfgt_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfgt.vf v10, v8, fa0
+; CHECK-NEXT: vmand.mm v0, v10, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4f32.f32(<vscale x 4 x float> %a, float %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i1> @vmfgt_vv(<vscale x 4 x float> %a, <vscale x 4 x i1> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmfgt_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmflt.vv v12, v10, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; NOVLOPT-NEXT: vmand.mm v0, v12, v0
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmfgt_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmflt.vv v12, v10, v8
-; VLOPT-NEXT: vmand.mm v0, v12, v0
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmfgt_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v12, v10, v8
+; CHECK-NEXT: vmand.mm v0, v12, v0
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4f32.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %c, iXLen -1)
%2 = call <vscale x 4 x i1> @llvm.riscv.vmand.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %b, iXLen %vl)
ret <vscale x 4 x i1> %2
}
define <vscale x 4 x i32> @vmerge_vvm(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmerge_vvm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmerge.vvm v8, v8, v10, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmerge_vvm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmerge.vvm v8, v8, v10, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmerge_vvm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmerge.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmerge_vxm(<vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmerge_vxm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmerge.vxm v8, v8, a0, v0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmerge_vxm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vmerge.vxm v8, v8, a0, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmerge_vxm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmerge.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vmerge_vim(<vscale x 4 x i32> %a, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vmerge_vim:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmerge.vim v8, v8, 9, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vmerge_vim:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vmerge.vim v8, v8, 9, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vmerge_vim:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 9, v0
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vmerge.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 9, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vadc_vvm(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vadc_vvm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadc.vvm v8, v8, v10, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vadc_vvm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vadc.vvm v8, v8, v10, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vadc_vvm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vadc.vvm v8, v8, v10, v0
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vadc.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vadc_vxm(<vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vadc_vxm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadc.vxm v8, v8, a0, v0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vadc_vxm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vadc.vxm v8, v8, a0, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vadc_vxm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vadc.vxm v8, v8, a0, v0
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vadc.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vadc_vim(<vscale x 4 x i32> %a, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vadc_vim:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadc.vim v8, v8, 9, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vadc_vim:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vadc.vim v8, v8, 9, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vadc_vim:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vadc.vim v8, v8, 9, v0
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vadc.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 9, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vaadd_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vaadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vaadd.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vaadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vaadd.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vaadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vaadd.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vaadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vaadd_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vaadd_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vaadd.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vaadd_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vaadd.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vaadd_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vaadd.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i32> @llvm.riscv.vaadd.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vasub_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vasub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vasub.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vasub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vasub.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vasub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vasub.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vasub.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vasub_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vasub_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vasub.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vasub_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vasub.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vasub_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vasub.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i32> @llvm.riscv.vasub.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vaaddu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vaaddu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vaaddu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vaaddu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vaaddu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vaaddu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vaaddu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vaaddu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vaaddu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vaaddu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vaaddu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vaaddu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vaaddu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vaaddu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i32> @llvm.riscv.vaaddu.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vasubu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vasubu_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vasubu.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vasubu_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vasubu.vv v8, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vasubu_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vasubu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vasubu.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vasubu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
-; NOVLOPT-LABEL: vasubu_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vasubu.vx v10, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vasubu_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vasubu.vx v10, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vasubu_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vasubu.vx v10, v8, a0
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x i32> @llvm.riscv.vasubu.nxv4i32.i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %b, iXLen 0, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x float> @vfmax_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmax_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmax.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmax_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmax.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmax_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmax.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmax_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmax_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmax.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmax_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmax.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmax_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmax.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmax.nxv4f32.f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmin_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmin_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmin.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmin_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmin.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmin_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmin.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmin_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmin_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmin.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmin_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmin.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmin_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmin.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmin.nxv4f32.f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsgnj_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsgnj_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsgnj.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsgnj_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsgnj.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsgnj_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfsgnj.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsgnj_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsgnj_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsgnj.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsgnj_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsgnj.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsgnj_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsgnj.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x float> @llvm.riscv.vfsgnj.nxv4f32.f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsgnjn_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsgnjn_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsgnjn.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsgnjn_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsgnjn.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsgnjn_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfsgnjn.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsgnjn_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsgnjn_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsgnjn.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsgnjn_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsgnjn.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsgnjn_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x float> @llvm.riscv.vfsgnjn.nxv4f32.f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsgnjx_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsgnjx_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsgnjx.vv v8, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsgnjx_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsgnjx.vv v8, v8, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsgnjx_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfsgnjx.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %b, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfsgnjx_vf(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfsgnjx_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfsgnjx.vf v10, v8, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsgnjx_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfsgnjx.vf v10, v8, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsgnjx_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v10, v8, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
  %1 = call <vscale x 4 x float> @llvm.riscv.vfsgnjx.nxv4f32.f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmerge_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x i1> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmerge_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmerge.vfm v10, v8, fa0, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmerge_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmerge.vfm v10, v8, fa0, v0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmerge_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmerge.vfm v10, v8, fa0, v0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmerge(<vscale x 4 x float> poison, <vscale x 4 x float> %a, float %b, <vscale x 4 x i1> %c, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmv_v_f(<vscale x 4 x float> %a, float %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfmv_v_f:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmv.v.f v10, fa0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v10, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmv_v_f:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmv.v.f v10, fa0
-; VLOPT-NEXT: vfadd.vv v8, v10, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmv_v_f:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.v.f v10, fa0
+; CHECK-NEXT: vfadd.vv v8, v10, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmv.v.f(<vscale x 4 x float> poison, float %b, iXLen -1)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %a, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmacc_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmacc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmacc.vv v8, v12, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmacc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmacc.vv v8, v12, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmacc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmacc.vv v8, v12, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmacc(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmacc_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmacc_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmacc.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmacc_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmacc.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmacc_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmacc(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmacc_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmacc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmacc.vv v8, v12, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmacc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmacc.vv v8, v12, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmacc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmacc.vv v8, v12, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmacc(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmacc_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmacc_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmacc.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmacc_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmacc.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmacc_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmacc(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmsac_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmsac_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmsac.vv v8, v12, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmsac_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmsac.vv v8, v12, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmsac_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmsac.vv v8, v12, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmsac(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmsac_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmsac_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmsac.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmsac_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmsac.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmsac_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmsac(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmsac_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmsac_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmsac.vv v8, v12, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmsac_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmsac.vv v8, v12, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmsac_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmsac.vv v8, v12, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmsac(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmsac_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmsac_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmsac.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmsac_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmsac.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmsac_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmsac(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmadd_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmadd.vv v8, v10, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmadd.vv v8, v10, v12
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v10, v12
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmadd(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmadd_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmadd_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmadd.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmadd_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmadd.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmadd_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmadd(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmadd_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmadd_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmadd.vv v8, v10, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmadd_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmadd.vv v8, v10, v12
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmadd_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmadd.vv v8, v10, v12
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmadd(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmadd_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmadd_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmadd.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmadd_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmadd.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmadd_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmadd(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmsub_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmsub.vv v8, v10, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmsub.vv v8, v10, v12
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmsub.vv v8, v10, v12
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmsub(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfmsub_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfmsub_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfmsub.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfmsub_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfmsub.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfmsub_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfmsub(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmsub_vv(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmsub_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmsub.vv v8, v10, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmsub_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmsub.vv v8, v10, v12
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmsub_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmsub.vv v8, v10, v12
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmsub(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x float> @vfnmsub_vf(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vfnmsub_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfnmsub.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfnmsub_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfnmsub.vf v8, fa0, v10
-; VLOPT-NEXT: vfadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfnmsub_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v10
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfnmsub(<vscale x 4 x float> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 3)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %c, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x double> @vfwmacc_vv(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmacc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmacc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmacc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwmacc.vv v8, v12, v14
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwmacc_vf(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmacc_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwmacc.vf v8, fa0, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmacc_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwmacc.vf v8, fa0, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmacc_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwmacc.vf v8, fa0, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwnmacc_vv(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwnmacc_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwnmacc.vv v8, v12, v14
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwnmacc_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwnmacc.vv v8, v12, v14
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwnmacc_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwnmacc.vv v8, v12, v14
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwnmacc(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwnmacc_vf(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwnmacc_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwnmacc.vf v8, fa0, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwnmacc_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwnmacc.vf v8, fa0, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwnmacc_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwnmacc(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwmsac_vv(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmsac_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwmsac.vv v8, v12, v14
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmsac_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwmsac.vv v8, v12, v14
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmsac_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwmsac.vv v8, v12, v14
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwmsac(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwmsac_vf(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmsac_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwmsac.vf v8, fa0, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmsac_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwmsac.vf v8, fa0, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmsac_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwmsac(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwnmsac_vv(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwnmsac_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwnmsac.vv v8, v12, v14
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwnmsac_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwnmsac.vv v8, v12, v14
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwnmsac_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwnmsac.vv v8, v12, v14
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwnmsac(<vscale x 4 x double> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfwnmsac_vf(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, <vscale x 4 x double> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwnmsac_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma
-; NOVLOPT-NEXT: vfwnmsac.vf v8, fa0, v12
-; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v16
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwnmsac_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; VLOPT-NEXT: vfwnmsac.vf v8, fa0, v12
-; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v16
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwnmsac_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x double> @llvm.riscv.vfwnmsac(<vscale x 4 x double> %a, float %b, <vscale x 4 x float> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x double> @llvm.riscv.vfadd(<vscale x 4 x double> poison, <vscale x 4 x double> %1, <vscale x 4 x double> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x float> @vfwmaccbf16_vv(<vscale x 4 x float> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c, <vscale x 4 x float> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmaccbf16_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vfwmaccbf16.vv v8, v10, v11
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmaccbf16_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
-; VLOPT-NEXT: vfwmaccbf16.vv v8, v10, v11
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmaccbf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vfwmaccbf16.vv v8, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfwmaccbf16(<vscale x 4 x float> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x i32> @vsbc_vvm(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, iXLen %vl) {
-; NOVLOPT-LABEL: vsbc_vvm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsbc.vvm v8, v8, v10, v0
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsbc_vvm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vsbc.vvm v8, v8, v10, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsbc_vvm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsbc.vvm v8, v8, v10, v0
+; CHECK-NEXT: vadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsbc.nxv4i32.nxv4i32.nxv4i1(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %c, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vsbc_vxm(<vscale x 4 x i32> %a, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %b, i32 %c, iXLen %vl) {
-; NOVLOPT-LABEL: vsbc_vxm:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vsbc.vxm v8, v8, a0, v0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vsbc_vxm:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vsbc.vxm v8, v8, a0, v0
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vsbc_vxm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsbc.vxm v8, v8, a0, v0
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vsbc.nxv4i32.i32.nxv4i1(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, i32 %c, <vscale x 4 x i1> %mask, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vfclass_v(<vscale x 4 x float> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vfclass_v:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfclass.v v8, v8
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfclass_v:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vfclass.v v8, v8
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfclass_v:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vfclass.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x float> %a, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrgather_vi(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrgather_vi:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrgather.vi v12, v8, 5
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrgather_vi:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vrgather.vi v12, v8, 5
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrgather_vi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vrgather.vi v12, v8, 5
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen 5, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrgather_vv(<vscale x 4 x i32> %a, <vscale x 4 x i32> %idx, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrgather_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrgather.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrgather_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vrgather.vv v12, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v12, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrgather_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vrgather.vv v12, v8, v10
+; CHECK-NEXT: vadd.vv v8, v12, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %idx, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrgather_vx(<vscale x 4 x i32> %a, iXLen %idx, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrgather_vx:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrgather.vx v12, v8, a0
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrgather_vx:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vrgather.vx v12, v8, a0
-; VLOPT-NEXT: vadd.vv v8, v12, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrgather_vx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vrgather.vx v12, v8, a0
+; CHECK-NEXT: vadd.vv v8, v12, v10
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, iXLen %idx, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %b, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x i32> @vrgatherei16_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %idx, <vscale x 4 x i32> %b, iXLen %vl) {
-; NOVLOPT-LABEL: vrgatherei16_vv:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vrgatherei16.vv v12, v8, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v12, v8
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vrgatherei16_vv:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; VLOPT-NEXT: vrgatherei16.vv v12, v8, v10
-; VLOPT-NEXT: vadd.vv v8, v12, v8
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vrgatherei16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v10
+; CHECK-NEXT: vadd.vv v8, v12, v8
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i16> %idx, iXLen -1)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a, iXLen %vl)
ret <vscale x 4 x i32> %2
}
define <vscale x 4 x float> @vfwmaccbf16_vf(<vscale x 4 x float> %a, bfloat %b, <vscale x 4 x bfloat> %c, <vscale x 4 x float> %d, iXLen %vl) {
-; NOVLOPT-LABEL: vfwmaccbf16_vf:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
-; NOVLOPT-NEXT: vfwmaccbf16.vf v8, fa0, v10
-; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfwmaccbf16_vf:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
-; VLOPT-NEXT: vfwmaccbf16.vf v8, fa0, v10
-; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfwmaccbf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vfwmaccbf16.vf v8, fa0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfwmaccbf16(<vscale x 4 x float> %a, bfloat %b, <vscale x 4 x bfloat> %c, iXLen 7, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %d, iXLen 7, iXLen %vl)
ret <vscale x 4 x float> %2
}
define <vscale x 4 x double> @vfsqrt(<vscale x 4 x float> %a) {
-; NOVLOPT-LABEL: vfsqrt:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmv2r.v v12, v8
-; NOVLOPT-NEXT: fsrmi a0, 0
-; NOVLOPT-NEXT: vfsqrt.v v14, v8
-; NOVLOPT-NEXT: fsrm a0
-; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfsqrt:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; VLOPT-NEXT: vmv2r.v v12, v8
-; VLOPT-NEXT: fsrmi a0, 0
-; VLOPT-NEXT: vfsqrt.v v14, v8
-; VLOPT-NEXT: fsrm a0
-; VLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfsqrt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfsqrt.v v14, v8
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfwmacc.vv v8, v12, v14
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfsqrt.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 0, iXLen 7)
%2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfrsqrt7(<vscale x 4 x float> %a) {
-; NOVLOPT-LABEL: vfrsqrt7:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmv2r.v v12, v8
-; NOVLOPT-NEXT: vfrsqrt7.v v14, v8
-; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfrsqrt7:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; VLOPT-NEXT: vmv2r.v v12, v8
-; VLOPT-NEXT: vfrsqrt7.v v14, v8
-; VLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfrsqrt7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vfrsqrt7.v v14, v8
+; CHECK-NEXT: vfwmacc.vv v8, v12, v14
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 7)
%2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0)
ret <vscale x 4 x double> %2
}
define <vscale x 4 x double> @vfrec7(<vscale x 4 x float> %a) {
-; NOVLOPT-LABEL: vfrec7:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmv2r.v v12, v8
-; NOVLOPT-NEXT: fsrmi a0, 0
-; NOVLOPT-NEXT: vfrec7.v v14, v8
-; NOVLOPT-NEXT: fsrm a0
-; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vfrec7:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; VLOPT-NEXT: vmv2r.v v12, v8
-; VLOPT-NEXT: fsrmi a0, 0
-; VLOPT-NEXT: vfrec7.v v14, v8
-; VLOPT-NEXT: fsrm a0
-; VLOPT-NEXT: vfwmacc.vv v8, v12, v14
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vfrec7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfrec7.v v14, v8
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfwmacc.vv v8, v12, v14
+; CHECK-NEXT: ret
%1 = call <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 0, iXLen 7)
%2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0)
 ret <vscale x 4 x double> %2
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-no-prop.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-no-prop.ll
index 8507254..e1f641a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-no-prop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-no-prop.ll
@@ -1,12 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: | FileCheck %s
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -riscv-enable-vl-optimizer \
-; RUN: -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -riscv-enable-vl-optimizer \
-; RUN: -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
declare <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, iXLen)
declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.iXLen(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
index 938f575..545fcc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
@@ -1,12 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvl512b -verify-machineinstrs \
-; RUN: -riscv-enable-vl-optimizer=false | FileCheck %s -check-prefixes=CHECK,NOVLOPT
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvl512b -verify-machineinstrs \
-; RUN: -riscv-enable-vl-optimizer=false | FileCheck %s -check-prefixes=CHECK,NOVLOPT
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvl512b -riscv-enable-vl-optimizer \
-; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLOPT
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvl512b -riscv-enable-vl-optimizer \
-; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLOPT
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvl512b -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvl512b -verify-machineinstrs | FileCheck %s
define <2 x i32> @vdot_lane_s32(<2 x i32> noundef %var_1, <8 x i8> noundef %var_3, <8 x i8> noundef %var_5, <8 x i16> %x) {
; CHECK-LABEL: vdot_lane_s32:
@@ -40,20 +34,12 @@ declare <vscale x 2 x i16> @llvm.riscv.vnsrl.nxv2i16.nxv2i32.nxv2i16(
iXLen);
define <vscale x 2 x i16> @intrinsic_vnsrl_wv_nxv2i16_nxv2i32_nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, iXLen %2, <vscale x 2 x i32> %3, <vscale x 2 x i32> %4, <vscale x 2 x i16> %z) nounwind {
-; NOVLOPT-LABEL: intrinsic_vnsrl_wv_nxv2i16_nxv2i32_nxv2i16:
-; NOVLOPT: # %bb.0: # %entry
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; NOVLOPT-NEXT: vwadd.vv v10, v8, v9
-; NOVLOPT-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; NOVLOPT-NEXT: vnsrl.wv v8, v10, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: intrinsic_vnsrl_wv_nxv2i16_nxv2i32_nxv2i16:
-; VLOPT: # %bb.0: # %entry
-; VLOPT-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; VLOPT-NEXT: vwadd.vv v10, v8, v9
-; VLOPT-NEXT: vnsrl.wv v8, v10, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: intrinsic_vnsrl_wv_nxv2i16_nxv2i32_nxv2i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vwadd.vv v10, v8, v9
+; CHECK-NEXT: vnsrl.wv v8, v10, v12
+; CHECK-NEXT: ret
entry:
%c = sext <vscale x 2 x i16> %a to <vscale x 2 x i32>
%d = sext <vscale x 2 x i16> %b to <vscale x 2 x i32>
@@ -74,22 +60,13 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclip.nxv2i16.nxv2i32.nxv2i16(
iXLen, iXLen);
define <vscale x 2 x i16> @vnclip(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, iXLen %2, <vscale x 2 x i32> %3, <vscale x 2 x i32> %4, <vscale x 2 x i16> %z) nounwind {
-; NOVLOPT-LABEL: vnclip:
-; NOVLOPT: # %bb.0: # %entry
-; NOVLOPT-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; NOVLOPT-NEXT: vwadd.vv v10, v8, v9
-; NOVLOPT-NEXT: csrwi vxrm, 0
-; NOVLOPT-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; NOVLOPT-NEXT: vnclip.wv v8, v10, v12
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vnclip:
-; VLOPT: # %bb.0: # %entry
-; VLOPT-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; VLOPT-NEXT: vwadd.vv v10, v8, v9
-; VLOPT-NEXT: csrwi vxrm, 0
-; VLOPT-NEXT: vnclip.wv v8, v10, v12
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vnclip:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vwadd.vv v10, v8, v9
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vnclip.wv v8, v10, v12
+; CHECK-NEXT: ret
entry:
%c = sext <vscale x 2 x i16> %a to <vscale x 2 x i32>
%d = sext <vscale x 2 x i16> %b to <vscale x 2 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
index 823c2bb..cd282c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
@@ -1,50 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: -riscv-enable-vl-optimizer=false | FileCheck %s -check-prefixes=CHECK,NOVLOPT
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: -riscv-enable-vl-optimizer=false | FileCheck %s -check-prefixes=CHECK,NOVLOPT
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -riscv-enable-vl-optimizer \
-; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLOPT
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -riscv-enable-vl-optimizer \
-; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLOPT
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
declare <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, iXLen)
define <vscale x 4 x i32> @different_imm_vl_with_ta(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
-; NOVLOPT-LABEL: different_imm_vl_with_ta:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetivli zero, 5, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v12
-; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: different_imm_vl_with_ta:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v12
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: different_imm_vl_with_ta:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v12
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5)
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
ret <vscale x 4 x i32> %w
}
define <vscale x 4 x i32> @vlmax_and_imm_vl_with_ta(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
-; NOVLOPT-LABEL: vlmax_and_imm_vl_with_ta:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v12
-; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v8, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: vlmax_and_imm_vl_with_ta:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v12
-; VLOPT-NEXT: vadd.vv v8, v8, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: vlmax_and_imm_vl_with_ta:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v12
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: ret
%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1)
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
ret <vscale x 4 x i32> %w
@@ -126,22 +104,13 @@ define <vscale x 4 x i32> @different_vl_with_tu(<vscale x 4 x i32> %passthru, <v
; We can propagate VL to a tail-undisturbed policy, provided none of its users
; are passthrus (i.e., none of them read past VL).
define <vscale x 4 x i32> @different_imm_vl_with_tu(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
-; NOVLOPT-LABEL: different_imm_vl_with_tu:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; NOVLOPT-NEXT: vmv2r.v v14, v10
-; NOVLOPT-NEXT: vadd.vv v14, v10, v12
-; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; NOVLOPT-NEXT: vadd.vv v8, v14, v10
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: different_imm_vl_with_tu:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; VLOPT-NEXT: vmv2r.v v14, v10
-; VLOPT-NEXT: vadd.vv v14, v10, v12
-; VLOPT-NEXT: vadd.vv v8, v14, v10
-; VLOPT-NEXT: ret
+; CHECK-LABEL: different_imm_vl_with_tu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT: vmv2r.v v14, v10
+; CHECK-NEXT: vadd.vv v14, v10, v12
+; CHECK-NEXT: vadd.vv v8, v14, v10
+; CHECK-NEXT: ret
%v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5)
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4)
ret <vscale x 4 x i32> %w
@@ -195,22 +164,13 @@ define <vscale x 4 x i32> @dont_optimize_tied_def(<vscale x 4 x i32> %a, <vscale
}
define void @optimize_ternary_use(<vscale x 4 x i16> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, ptr %p, iXLen %vl) {
-; NOVLOPT-LABEL: optimize_ternary_use:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vzext.vf2 v14, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vmadd.vv v14, v10, v12
-; NOVLOPT-NEXT: vse32.v v14, (a0)
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: optimize_ternary_use:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vzext.vf2 v14, v8
-; VLOPT-NEXT: vmadd.vv v14, v10, v12
-; VLOPT-NEXT: vse32.v v14, (a0)
-; VLOPT-NEXT: ret
+; CHECK-LABEL: optimize_ternary_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v14, v8
+; CHECK-NEXT: vmadd.vv v14, v10, v12
+; CHECK-NEXT: vse32.v v14, (a0)
+; CHECK-NEXT: ret
%1 = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
%2 = mul <vscale x 4 x i32> %b, %1
%3 = add <vscale x 4 x i32> %2, %c
@@ -221,28 +181,16 @@ define void @optimize_ternary_use(<vscale x 4 x i16> %a, <vscale x 4 x i32> %b,
; This function has a copy between two vrm2 virtual registers; make sure we can
; reduce the VL across it.
define void @fadd_fcmp_select_copy(<vscale x 4 x float> %v, <vscale x 4 x i1> %c, ptr %p, iXLen %vl) {
-; NOVLOPT-LABEL: fadd_fcmp_select_copy:
-; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; NOVLOPT-NEXT: vfadd.vv v8, v8, v8
-; NOVLOPT-NEXT: fmv.w.x fa5, zero
-; NOVLOPT-NEXT: vmflt.vf v10, v8, fa5
-; NOVLOPT-NEXT: vmand.mm v10, v0, v10
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vse32.v v8, (a0)
-; NOVLOPT-NEXT: vsm.v v10, (a0)
-; NOVLOPT-NEXT: ret
-;
-; VLOPT-LABEL: fadd_fcmp_select_copy:
-; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; VLOPT-NEXT: vfadd.vv v8, v8, v8
-; VLOPT-NEXT: fmv.w.x fa5, zero
-; VLOPT-NEXT: vmflt.vf v10, v8, fa5
-; VLOPT-NEXT: vmand.mm v10, v0, v10
-; VLOPT-NEXT: vse32.v v8, (a0)
-; VLOPT-NEXT: vsm.v v10, (a0)
-; VLOPT-NEXT: ret
+; CHECK-LABEL: fadd_fcmp_select_copy:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vmflt.vf v10, v8, fa5
+; CHECK-NEXT: vmand.mm v10, v0, v10
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: vsm.v v10, (a0)
+; CHECK-NEXT: ret
%fadd = fadd <vscale x 4 x float> %v, %v
%fcmp = fcmp olt <vscale x 4 x float> %fadd, zeroinitializer
%select = select <vscale x 4 x i1> %c, <vscale x 4 x i1> %fcmp, <vscale x 4 x i1> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll b/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll
index a14268a..4b9f9a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlopt-same-vl.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-enable-vl-optimizer \
+; RUN: llc -mtriple=riscv64 -mattr=+v \
; RUN: -verify-machineinstrs -debug-only=riscv-vl-optimizer -o - 2>&1 %s | FileCheck %s
; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
index 25a226e..eb129da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
@@ -959,7 +959,7 @@ define <vscale x 1 x i64> @vrol_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vrol_vx_nxv1i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; CHECK-RV64-NEXT: vsll.vx v9, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
@@ -1022,7 +1022,7 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vrol_vx_nxv2i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; CHECK-RV64-NEXT: vsll.vx v10, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
@@ -1085,7 +1085,7 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vrol_vx_nxv4i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; CHECK-RV64-NEXT: vsll.vx v12, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
@@ -1148,7 +1148,7 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vrol_vx_nxv8i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; CHECK-RV64-NEXT: vsll.vx v16, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
index 9e63b61..97524ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
@@ -1626,7 +1626,7 @@ define <vscale x 1 x i64> @vror_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vror_vx_nxv1i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; CHECK-RV64-NEXT: vsrl.vx v9, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
@@ -1728,7 +1728,7 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vror_vx_nxv2i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma
; CHECK-RV64-NEXT: vsrl.vx v10, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
@@ -1830,7 +1830,7 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vror_vx_nxv4i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma
; CHECK-RV64-NEXT: vsrl.vx v12, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
@@ -1932,7 +1932,7 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
; CHECK-RV64-LABEL: vror_vx_nxv8i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: andi a1, a0, 63
-; CHECK-RV64-NEXT: negw a0, a0
+; CHECK-RV64-NEXT: neg a0, a0
; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; CHECK-RV64-NEXT: vsrl.vx v16, v8, a1
; CHECK-RV64-NEXT: andi a0, a0, 63
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
index 8eef133..4442f97 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
@@ -77,7 +77,7 @@ define i64 @con1024_minus_rem() {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: negw a0, a0
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: andi a0, a0, 1024
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
index 2bac1ee..87787c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
@@ -13,7 +13,7 @@ body: |
; MIR-NEXT: {{ $}}
; MIR-NEXT: WriteVXRMImm 0, implicit-def $vxrm
; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype
- ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vxrm, implicit $vl, implicit $vtype
+ ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef renamable $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vxrm, implicit $vl, implicit $vtype
; MIR-NEXT: PseudoRET implicit $v8
; ASM-LABEL: verify_vxrm:
; ASM: # %bb.0:
@@ -24,6 +24,7 @@ body: |
%0:vr = COPY $v8
%1:vr = COPY $v9
%2:gprnox0 = COPY $x10
- renamable $v8 = PseudoVAADD_VV_MF8 undef $noreg, %0, %1, 0, %2, 3 /* e8 */, 0
+ %3:vr = PseudoVAADD_VV_MF8 undef $noreg, %0, %1, 0, %2, 3 /* e8 */, 0
+ $v8 = COPY %3
PseudoRET implicit $v8
...
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index 0ea80bf..2e1784d 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -647,7 +647,7 @@ define i32 @select_add_1(i1 zeroext %cond, i32 %a, i32 %b) {
;
; RV64IM-LABEL: select_add_1:
; RV64IM: # %bb.0: # %entry
-; RV64IM-NEXT: negw a0, a0
+; RV64IM-NEXT: neg a0, a0
; RV64IM-NEXT: and a0, a0, a1
; RV64IM-NEXT: addw a0, a2, a0
; RV64IM-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index b128abb..b155fea 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -1048,21 +1048,21 @@ define signext i32 @bug(i32 signext %x) {
; CHECK-NEXT: srliw a2, a0, 24
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 3
-; CHECK-NEXT: negw a2, a2
+; CHECK-NEXT: neg a2, a2
; CHECK-NEXT: sllw a0, a0, a3
; CHECK-NEXT: andi a2, a2, -8
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: srliw a2, a0, 28
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 2
-; CHECK-NEXT: negw a2, a2
+; CHECK-NEXT: neg a2, a2
; CHECK-NEXT: sllw a0, a0, a3
; CHECK-NEXT: andi a2, a2, -4
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: srliw a2, a0, 30
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 1
-; CHECK-NEXT: negw a2, a2
+; CHECK-NEXT: neg a2, a2
; CHECK-NEXT: sllw a0, a0, a3
; CHECK-NEXT: andi a2, a2, -2
; CHECK-NEXT: add a1, a1, a2
@@ -1090,21 +1090,21 @@ define signext i32 @bug(i32 signext %x) {
; NOREMOVAL-NEXT: srliw a2, a0, 24
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 3
-; NOREMOVAL-NEXT: negw a2, a2
+; NOREMOVAL-NEXT: neg a2, a2
; NOREMOVAL-NEXT: sllw a0, a0, a3
; NOREMOVAL-NEXT: andi a2, a2, -8
; NOREMOVAL-NEXT: add a1, a1, a2
; NOREMOVAL-NEXT: srliw a2, a0, 28
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 2
-; NOREMOVAL-NEXT: negw a2, a2
+; NOREMOVAL-NEXT: neg a2, a2
; NOREMOVAL-NEXT: sllw a0, a0, a3
; NOREMOVAL-NEXT: andi a2, a2, -4
; NOREMOVAL-NEXT: add a1, a1, a2
; NOREMOVAL-NEXT: srliw a2, a0, 30
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 1
-; NOREMOVAL-NEXT: negw a2, a2
+; NOREMOVAL-NEXT: neg a2, a2
; NOREMOVAL-NEXT: sllw a0, a0, a3
; NOREMOVAL-NEXT: andi a2, a2, -2
; NOREMOVAL-NEXT: add a1, a1, a2
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 7ca1ee1..1ca23d7 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -383,7 +383,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
; RV64I-LABEL: fshr64_minsize:
; RV64I: # %bb.0:
; RV64I-NEXT: srl a2, a0, a1
-; RV64I-NEXT: negw a1, a1
+; RV64I-NEXT: neg a1, a1
; RV64I-NEXT: sll a0, a0, a1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll
index 99dc4f8..e44d247 100644
--- a/llvm/test/CodeGen/RISCV/shl-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll
@@ -40,7 +40,7 @@ define i8 @shl_cttz_i8(i8 %x, i8 %y) {
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: srli a2, a1, 1
; RV64I-NEXT: andi a2, a2, 85
-; RV64I-NEXT: subw a1, a1, a2
+; RV64I-NEXT: sub a1, a1, a2
; RV64I-NEXT: andi a2, a1, 51
; RV64I-NEXT: srli a1, a1, 2
; RV64I-NEXT: andi a1, a1, 51
@@ -96,7 +96,7 @@ define i8 @shl_cttz_constant_i8(i8 %y) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: andi a1, a1, 85
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: andi a1, a0, 51
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: andi a0, a0, 51
@@ -276,7 +276,7 @@ define i32 @shl_cttz_i32(i32 %x, i32 %y) {
;
; RV64I-LABEL: shl_cttz_i32:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 30667
; RV64I-NEXT: addi a2, a2, 1329
@@ -333,7 +333,7 @@ define i32 @shl_cttz_i32_zero_is_defined(i32 %x, i32 %y) {
; RV64I-NEXT: sext.w a2, a1
; RV64I-NEXT: beqz a2, .LBB5_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 30667
; RV64I-NEXT: addi a2, a2, 1329
@@ -378,7 +378,7 @@ define i32 @shl_cttz_constant_i32(i32 %y) {
;
; RV64I-LABEL: shl_cttz_constant_i32:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: lui a1, 30667
; RV64I-NEXT: addi a1, a1, 1329
@@ -474,7 +474,7 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) {
; RV64I-NEXT: .cfi_offset ra, -8
; RV64I-NEXT: .cfi_offset s0, -16
; RV64I-NEXT: .cfi_offset s1, -24
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 30667
; RV64I-NEXT: addi a2, a2, 1329
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 93fb230..bc23388 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -50,7 +50,7 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; RV64-NEXT: add a2, a2, a4
; RV64-NEXT: slli a4, a0, 2
; RV64-NEXT: add a4, a0, a4
-; RV64-NEXT: subw a1, a1, a4
+; RV64-NEXT: sub a1, a1, a4
; RV64-NEXT: slli a4, a0, 17
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: slli a0, a0, 23
@@ -59,8 +59,8 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; RV64-NEXT: add a1, a1, a3
; RV64-NEXT: lui a3, 1324
; RV64-NEXT: addi a2, a2, -83
-; RV64-NEXT: subw a0, a0, a2
-; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: sub a0, a0, a2
+; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: slli a1, a1, 35
; RV64-NEXT: srli a1, a1, 35
; RV64-NEXT: addi a0, a3, -165
@@ -189,7 +189,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV64M-NEXT: add a1, a1, a2
; RV64M-NEXT: slli a2, a1, 3
; RV64M-NEXT: slli a1, a1, 1
-; RV64M-NEXT: subw a1, a1, a2
+; RV64M-NEXT: sub a1, a1, a2
; RV64M-NEXT: add a0, a0, a1
; RV64M-NEXT: andi a0, a0, 15
; RV64M-NEXT: addi a0, a0, -1
@@ -225,7 +225,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV64MV-NEXT: add a1, a1, a2
; RV64MV-NEXT: slli a2, a1, 3
; RV64MV-NEXT: slli a1, a1, 1
-; RV64MV-NEXT: subw a1, a1, a2
+; RV64MV-NEXT: sub a1, a1, a2
; RV64MV-NEXT: add a0, a0, a1
; RV64MV-NEXT: andi a0, a0, 15
; RV64MV-NEXT: addi a0, a0, -1
@@ -256,7 +256,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV64-NEXT: srli a1, a1, 62
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: andi a1, a1, 60
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: andi a0, a0, 63
; RV64-NEXT: snez a0, a0
; RV64-NEXT: ret
@@ -280,7 +280,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV64M-NEXT: srli a1, a1, 62
; RV64M-NEXT: add a1, a0, a1
; RV64M-NEXT: andi a1, a1, 60
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: andi a0, a0, 63
; RV64M-NEXT: snez a0, a0
; RV64M-NEXT: ret
@@ -304,7 +304,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; RV64MV-NEXT: srli a1, a1, 62
; RV64MV-NEXT: add a1, a0, a1
; RV64MV-NEXT: andi a1, a1, 60
-; RV64MV-NEXT: subw a0, a0, a1
+; RV64MV-NEXT: sub a0, a0, a1
; RV64MV-NEXT: andi a0, a0, 63
; RV64MV-NEXT: snez a0, a0
; RV64MV-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 30ffaf6..5129ccc 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -183,10 +183,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
; RV64IM-NEXT: mul a5, a5, t1
; RV64IM-NEXT: li t1, -124
; RV64IM-NEXT: mul a6, a6, t1
-; RV64IM-NEXT: subw a4, a4, a7
-; RV64IM-NEXT: subw a1, a1, t0
-; RV64IM-NEXT: subw a3, a3, a5
-; RV64IM-NEXT: subw a2, a2, a6
+; RV64IM-NEXT: sub a4, a4, a7
+; RV64IM-NEXT: sub a1, a1, t0
+; RV64IM-NEXT: sub a3, a3, a5
+; RV64IM-NEXT: sub a2, a2, a6
; RV64IM-NEXT: sh a3, 0(a0)
; RV64IM-NEXT: sh a2, 2(a0)
; RV64IM-NEXT: sh a4, 4(a0)
@@ -357,10 +357,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
; RV64IM-NEXT: mul a7, a7, t1
; RV64IM-NEXT: mul t0, t0, t1
; RV64IM-NEXT: mul a2, a2, t1
-; RV64IM-NEXT: subw a3, a3, a6
-; RV64IM-NEXT: subw a4, a4, a7
-; RV64IM-NEXT: subw a5, a5, t0
-; RV64IM-NEXT: subw a1, a1, a2
+; RV64IM-NEXT: sub a3, a3, a6
+; RV64IM-NEXT: sub a4, a4, a7
+; RV64IM-NEXT: sub a5, a5, t0
+; RV64IM-NEXT: sub a1, a1, a2
; RV64IM-NEXT: sh a3, 0(a0)
; RV64IM-NEXT: sh a4, 2(a0)
; RV64IM-NEXT: sh a5, 4(a0)
@@ -597,10 +597,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
; RV64IM-NEXT: add a1, a1, t1
; RV64IM-NEXT: add a3, a3, t0
; RV64IM-NEXT: add a4, a4, a7
-; RV64IM-NEXT: subw a2, a2, a6
-; RV64IM-NEXT: subw a1, a1, t4
-; RV64IM-NEXT: subw a3, a3, t3
-; RV64IM-NEXT: subw a4, a4, t2
+; RV64IM-NEXT: sub a2, a2, a6
+; RV64IM-NEXT: sub a1, a1, t4
+; RV64IM-NEXT: sub a3, a3, t3
+; RV64IM-NEXT: sub a4, a4, t2
; RV64IM-NEXT: sh a2, 0(a0)
; RV64IM-NEXT: sh a1, 2(a0)
; RV64IM-NEXT: sh a3, 4(a0)
@@ -703,15 +703,15 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
; RV64I-NEXT: srli a1, a2, 58
; RV64I-NEXT: add a1, a2, a1
; RV64I-NEXT: andi a1, a1, -64
-; RV64I-NEXT: subw s1, a2, a1
+; RV64I-NEXT: sub s1, a2, a1
; RV64I-NEXT: srli a1, a3, 59
; RV64I-NEXT: add a1, a3, a1
; RV64I-NEXT: andi a1, a1, -32
-; RV64I-NEXT: subw s2, a3, a1
+; RV64I-NEXT: sub s2, a3, a1
; RV64I-NEXT: srli a1, a4, 61
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: andi a1, a1, -8
-; RV64I-NEXT: subw s3, a4, a1
+; RV64I-NEXT: sub s3, a4, a1
; RV64I-NEXT: li a1, 95
; RV64I-NEXT: call __moddi3
; RV64I-NEXT: sh s1, 0(s0)
@@ -737,23 +737,23 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
; RV64IM-NEXT: srli a6, a2, 58
; RV64IM-NEXT: add a6, a2, a6
; RV64IM-NEXT: andi a6, a6, -64
-; RV64IM-NEXT: subw a2, a2, a6
+; RV64IM-NEXT: sub a2, a2, a6
; RV64IM-NEXT: srli a6, a3, 59
; RV64IM-NEXT: add a6, a3, a6
; RV64IM-NEXT: andi a6, a6, -32
-; RV64IM-NEXT: subw a3, a3, a6
+; RV64IM-NEXT: sub a3, a3, a6
; RV64IM-NEXT: srli a6, a4, 61
; RV64IM-NEXT: mulh a5, a1, a5
; RV64IM-NEXT: add a6, a4, a6
; RV64IM-NEXT: add a5, a5, a1
; RV64IM-NEXT: andi a6, a6, -8
-; RV64IM-NEXT: subw a4, a4, a6
+; RV64IM-NEXT: sub a4, a4, a6
; RV64IM-NEXT: srli a6, a5, 63
; RV64IM-NEXT: srli a5, a5, 6
; RV64IM-NEXT: add a5, a5, a6
; RV64IM-NEXT: li a6, 95
; RV64IM-NEXT: mul a5, a5, a6
-; RV64IM-NEXT: subw a1, a1, a5
+; RV64IM-NEXT: sub a1, a1, a5
; RV64IM-NEXT: sh a2, 0(a0)
; RV64IM-NEXT: sh a3, 2(a0)
; RV64IM-NEXT: sh a4, 4(a0)
@@ -909,9 +909,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
; RV64IM-NEXT: mul a6, a6, a7
; RV64IM-NEXT: li a7, 23
; RV64IM-NEXT: mul a4, a4, a7
-; RV64IM-NEXT: subw a2, a2, a5
-; RV64IM-NEXT: subw a1, a1, a6
-; RV64IM-NEXT: subw a3, a3, a4
+; RV64IM-NEXT: sub a2, a2, a5
+; RV64IM-NEXT: sub a1, a1, a6
+; RV64IM-NEXT: sub a3, a3, a4
; RV64IM-NEXT: sh zero, 0(a0)
; RV64IM-NEXT: sh a2, 2(a0)
; RV64IM-NEXT: sh a3, 4(a0)
@@ -1011,7 +1011,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV64I-NEXT: add a1, a2, a1
; RV64I-NEXT: lui a3, 8
; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: subw s3, a2, a1
+; RV64I-NEXT: sub s3, a2, a1
; RV64I-NEXT: li a1, 23
; RV64I-NEXT: call __moddi3
; RV64I-NEXT: mv s2, a0
@@ -1050,7 +1050,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV64IM-NEXT: add a5, a5, a7
; RV64IM-NEXT: mulh a4, a3, a4
; RV64IM-NEXT: add a4, a4, a3
-; RV64IM-NEXT: subw a2, a2, a6
+; RV64IM-NEXT: sub a2, a2, a6
; RV64IM-NEXT: srli a6, a4, 63
; RV64IM-NEXT: srli a4, a4, 4
; RV64IM-NEXT: add a4, a4, a6
@@ -1059,8 +1059,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
; RV64IM-NEXT: mul a5, a5, a6
; RV64IM-NEXT: li a6, 23
; RV64IM-NEXT: mul a4, a4, a6
-; RV64IM-NEXT: subw a1, a1, a5
-; RV64IM-NEXT: subw a3, a3, a4
+; RV64IM-NEXT: sub a1, a1, a5
+; RV64IM-NEXT: sub a3, a3, a4
; RV64IM-NEXT: sh zero, 0(a0)
; RV64IM-NEXT: sh a2, 2(a0)
; RV64IM-NEXT: sh a3, 4(a0)
diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
index 3007c35..0c13a1d 100644
--- a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
@@ -26,7 +26,7 @@ define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) {
; CHECK-LABEL: overflow_sub:
; CHECK: # %bb.0:
-; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: sub a0, a0, a1
; CHECK-NEXT: ori a0, a0, 1
; CHECK-NEXT: slli a0, a0, 48
; CHECK-NEXT: srli a0, a0, 48
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index af5121d..ee49612 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -48,7 +48,7 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a2, a0, a1
+; RV64IM-NEXT: sub a2, a0, a1
; RV64IM-NEXT: srliw a2, a2, 1
; RV64IM-NEXT: add a1, a2, a1
; RV64IM-NEXT: srli a1, a1, 6
@@ -174,7 +174,7 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a2, a0, a1
+; RV64IM-NEXT: sub a2, a0, a1
; RV64IM-NEXT: srliw a2, a2, 1
; RV64IM-NEXT: add a1, a2, a1
; RV64IM-NEXT: li a2, 95
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index d33c666..636fdfa 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -31,11 +31,11 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; RV64-NEXT: slli a1, a0, 4
; RV64-NEXT: slli a2, a0, 6
; RV64-NEXT: slli a3, a0, 8
-; RV64-NEXT: subw a1, a1, a2
+; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a2, a0, 10
-; RV64-NEXT: subw a3, a3, a2
+; RV64-NEXT: sub a3, a3, a2
; RV64-NEXT: slli a2, a0, 2
-; RV64-NEXT: subw a2, a0, a2
+; RV64-NEXT: sub a2, a0, a2
; RV64-NEXT: slli a0, a0, 12
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: add a0, a3, a0
@@ -138,10 +138,10 @@ define i1 @test_urem_even(i27 %X) nounwind {
; RV64-NEXT: slli a4, a0, 18
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: slli a0, a0, 27
-; RV64-NEXT: subw a0, a0, a2
+; RV64-NEXT: sub a0, a0, a2
; RV64-NEXT: lui a2, 2341
; RV64-NEXT: add a1, a1, a3
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: slli a1, a0, 26
; RV64-NEXT: slli a0, a0, 37
; RV64-NEXT: srli a0, a0, 38
@@ -234,8 +234,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
; RV64-LABEL: test_urem_odd_setne:
; RV64: # %bb.0:
; RV64-NEXT: slli a1, a0, 1
-; RV64-NEXT: negw a0, a0
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: neg a0, a0
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: andi a0, a0, 15
; RV64-NEXT: sltiu a0, a0, 4
; RV64-NEXT: xori a0, a0, 1
@@ -254,8 +254,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
; RV64M-LABEL: test_urem_odd_setne:
; RV64M: # %bb.0:
; RV64M-NEXT: slli a1, a0, 1
-; RV64M-NEXT: negw a0, a0
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: neg a0, a0
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: andi a0, a0, 15
; RV64M-NEXT: sltiu a0, a0, 4
; RV64M-NEXT: xori a0, a0, 1
@@ -274,8 +274,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
; RV64MV-LABEL: test_urem_odd_setne:
; RV64MV: # %bb.0:
; RV64MV-NEXT: slli a1, a0, 1
-; RV64MV-NEXT: negw a0, a0
-; RV64MV-NEXT: subw a0, a0, a1
+; RV64MV-NEXT: neg a0, a0
+; RV64MV-NEXT: sub a0, a0, a1
; RV64MV-NEXT: andi a0, a0, 15
; RV64MV-NEXT: sltiu a0, a0, 4
; RV64MV-NEXT: xori a0, a0, 1
@@ -306,9 +306,9 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
; RV64-NEXT: slli a1, a0, 2
; RV64-NEXT: slli a2, a0, 4
; RV64-NEXT: slli a3, a0, 6
-; RV64-NEXT: subw a1, a1, a0
-; RV64-NEXT: subw a2, a2, a3
-; RV64-NEXT: subw a1, a1, a2
+; RV64-NEXT: sub a1, a1, a0
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a0, a0, 8
; RV64-NEXT: add a0, a1, a0
; RV64-NEXT: andi a0, a0, 511
@@ -437,7 +437,7 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV64-NEXT: addi a2, a2, -2
; RV64-NEXT: add a1, a1, a4
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: subw a4, t0, a7
+; RV64-NEXT: sub a4, t0, a7
; RV64-NEXT: slli a6, a3, 3
; RV64-NEXT: slli a7, a3, 6
; RV64-NEXT: slli t0, a3, 9
@@ -447,18 +447,18 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV64-NEXT: slli a6, a2, 4
; RV64-NEXT: add a7, a7, t0
; RV64-NEXT: slli t0, a2, 6
-; RV64-NEXT: subw a6, a6, t0
+; RV64-NEXT: sub a6, a6, t0
; RV64-NEXT: slli t0, a2, 8
-; RV64-NEXT: subw a5, a5, a2
+; RV64-NEXT: sub a5, a5, a2
; RV64-NEXT: slli a2, a2, 10
-; RV64-NEXT: subw a2, t0, a2
-; RV64-NEXT: subw a4, a4, a1
+; RV64-NEXT: sub a2, t0, a2
+; RV64-NEXT: sub a4, a4, a1
; RV64-NEXT: add a3, a3, a7
-; RV64-NEXT: subw a1, a5, a6
+; RV64-NEXT: sub a1, a5, a6
; RV64-NEXT: slli a5, a4, 10
; RV64-NEXT: slli a4, a4, 53
-; RV64-NEXT: negw a3, a3
-; RV64-NEXT: subw a1, a1, a2
+; RV64-NEXT: neg a3, a3
+; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: srli a4, a4, 54
; RV64-NEXT: andi a2, a3, 2047
; RV64-NEXT: andi a1, a1, 2047
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 3ef9f3f..5a3dfd1 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -157,10 +157,10 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
; RV64IM-NEXT: mul a7, a7, t1
; RV64IM-NEXT: slli t1, a5, 7
; RV64IM-NEXT: slli a5, a5, 2
-; RV64IM-NEXT: subw a5, a5, t1
-; RV64IM-NEXT: subw a2, a2, a6
-; RV64IM-NEXT: subw a4, a4, t0
-; RV64IM-NEXT: subw a1, a1, a7
+; RV64IM-NEXT: sub a5, a5, t1
+; RV64IM-NEXT: sub a2, a2, a6
+; RV64IM-NEXT: sub a4, a4, t0
+; RV64IM-NEXT: sub a1, a1, a7
; RV64IM-NEXT: add a3, a3, a5
; RV64IM-NEXT: sh a2, 0(a0)
; RV64IM-NEXT: sh a3, 2(a0)
@@ -300,10 +300,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
; RV64IM-NEXT: mul t0, t0, a6
; RV64IM-NEXT: mul t1, t1, a6
; RV64IM-NEXT: mul a2, a2, a6
-; RV64IM-NEXT: subw a3, a3, a7
-; RV64IM-NEXT: subw a4, a4, t0
-; RV64IM-NEXT: subw a5, a5, t1
-; RV64IM-NEXT: subw a1, a1, a2
+; RV64IM-NEXT: sub a3, a3, a7
+; RV64IM-NEXT: sub a4, a4, t0
+; RV64IM-NEXT: sub a5, a5, t1
+; RV64IM-NEXT: sub a1, a1, a2
; RV64IM-NEXT: sh a3, 0(a0)
; RV64IM-NEXT: sh a4, 2(a0)
; RV64IM-NEXT: sh a5, 4(a0)
@@ -508,10 +508,10 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
; RV64IM-NEXT: add a1, a1, t1
; RV64IM-NEXT: add a3, a3, t0
; RV64IM-NEXT: add a4, a4, a7
-; RV64IM-NEXT: subw a2, a2, a6
-; RV64IM-NEXT: subw a1, a1, t4
-; RV64IM-NEXT: subw a3, a3, t3
-; RV64IM-NEXT: subw a4, a4, t2
+; RV64IM-NEXT: sub a2, a2, a6
+; RV64IM-NEXT: sub a1, a1, t4
+; RV64IM-NEXT: sub a3, a3, t3
+; RV64IM-NEXT: sub a4, a4, t2
; RV64IM-NEXT: sh a2, 0(a0)
; RV64IM-NEXT: sh a1, 2(a0)
; RV64IM-NEXT: sh a3, 4(a0)
@@ -622,7 +622,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
; RV64IM-NEXT: andi a4, a4, 7
; RV64IM-NEXT: mulhu a5, a1, a5
; RV64IM-NEXT: mul a5, a5, a6
-; RV64IM-NEXT: subw a1, a1, a5
+; RV64IM-NEXT: sub a1, a1, a5
; RV64IM-NEXT: sh a2, 0(a0)
; RV64IM-NEXT: sh a3, 2(a0)
; RV64IM-NEXT: sh a4, 4(a0)
@@ -757,9 +757,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
; RV64IM-NEXT: addi a7, a7, 1327
; RV64IM-NEXT: mulhu a5, a1, a5
; RV64IM-NEXT: mul a5, a5, a7
-; RV64IM-NEXT: subw a2, a2, a4
-; RV64IM-NEXT: subw a3, a3, a6
-; RV64IM-NEXT: subw a1, a1, a5
+; RV64IM-NEXT: sub a2, a2, a4
+; RV64IM-NEXT: sub a3, a3, a6
+; RV64IM-NEXT: sub a1, a1, a5
; RV64IM-NEXT: sh zero, 0(a0)
; RV64IM-NEXT: sh a2, 2(a0)
; RV64IM-NEXT: sh a3, 4(a0)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 32753ca..cd7f30d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -716,92 +716,101 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 9(a0)
+; RV32I-NEXT: lbu t3, 10(a0)
+; RV32I-NEXT: lbu t4, 11(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
-; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t2, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: mv t0, sp
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srli t3, a0, 3
-; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: andi a5, a0, 31
-; RV32I-NEXT: andi t3, t3, 12
-; RV32I-NEXT: xori a5, a5, 31
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: add t0, t0, t3
-; RV32I-NEXT: sw a4, 0(sp)
-; RV32I-NEXT: sw a3, 4(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw a1, 12(sp)
-; RV32I-NEXT: lw a1, 4(t0)
-; RV32I-NEXT: lw a3, 8(t0)
-; RV32I-NEXT: lw a4, 0(t0)
-; RV32I-NEXT: lw a6, 12(t0)
-; RV32I-NEXT: srl a7, a1, a0
-; RV32I-NEXT: slli t0, a3, 1
-; RV32I-NEXT: srl a4, a4, a0
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli t1, a6, 1
-; RV32I-NEXT: srl a0, a6, a0
-; RV32I-NEXT: sll a6, t0, a5
-; RV32I-NEXT: sll a1, a1, a5
-; RV32I-NEXT: sll a5, t1, a5
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: mv t2, sp
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, t0, a7
+; RV32I-NEXT: or a5, t3, a5
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: add a0, t2, a0
+; RV32I-NEXT: lw a4, 4(a0)
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: lw a6, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: srl a7, a4, a1
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: srl a6, a6, a1
+; RV32I-NEXT: slli a4, a4, 1
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a3
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: sll a3, t1, a3
; RV32I-NEXT: srli t0, a0, 16
; RV32I-NEXT: srli t1, a0, 24
; RV32I-NEXT: srli t2, a0, 8
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, a4, a1
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t2, 13(a2)
; RV32I-NEXT: sb t0, 14(a2)
; RV32I-NEXT: sb t1, 15(a2)
; RV32I-NEXT: srli a0, a3, 16
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: srli a5, a3, 8
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: srli t1, a1, 8
-; RV32I-NEXT: srli t2, a6, 16
-; RV32I-NEXT: srli t3, a6, 24
+; RV32I-NEXT: srli a5, a3, 24
+; RV32I-NEXT: srli a6, a3, 8
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: srli t0, a4, 24
+; RV32I-NEXT: srli t1, a4, 8
+; RV32I-NEXT: srli t2, a1, 16
+; RV32I-NEXT: srli t3, a1, 24
; RV32I-NEXT: sb a3, 8(a2)
-; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: sb a6, 9(a2)
; RV32I-NEXT: sb a0, 10(a2)
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a0, a6, 8
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a0, a1, 8
+; RV32I-NEXT: sb a4, 0(a2)
; RV32I-NEXT: sb t1, 1(a2)
; RV32I-NEXT: sb a7, 2(a2)
; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a6, 4(a2)
+; RV32I-NEXT: sb a1, 4(a2)
; RV32I-NEXT: sb a0, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
@@ -943,93 +952,102 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 9(a0)
+; RV32I-NEXT: lbu t3, 10(a0)
+; RV32I-NEXT: lbu t4, 11(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
-; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t2, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: sw zero, 0(sp)
; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: addi t0, sp, 16
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srli t3, a0, 3
-; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: andi a5, a0, 31
-; RV32I-NEXT: andi t3, t3, 12
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: sub a7, t0, t3
-; RV32I-NEXT: sw a4, 16(sp)
-; RV32I-NEXT: sw a3, 20(sp)
-; RV32I-NEXT: sw a6, 24(sp)
-; RV32I-NEXT: sw a1, 28(sp)
-; RV32I-NEXT: lw a1, 0(a7)
-; RV32I-NEXT: lw a3, 4(a7)
-; RV32I-NEXT: lw a4, 8(a7)
-; RV32I-NEXT: lw a6, 12(a7)
-; RV32I-NEXT: xori a5, a5, 31
-; RV32I-NEXT: sll a7, a3, a0
-; RV32I-NEXT: srli t0, a1, 1
-; RV32I-NEXT: sll a6, a6, a0
-; RV32I-NEXT: srli t1, a4, 1
-; RV32I-NEXT: sll a4, a4, a0
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: sll a0, a1, a0
-; RV32I-NEXT: srl a1, t0, a5
-; RV32I-NEXT: srl t0, t1, a5
-; RV32I-NEXT: srl a3, a3, a5
-; RV32I-NEXT: srli a5, a0, 16
-; RV32I-NEXT: srli t1, a0, 24
-; RV32I-NEXT: srli t2, a0, 8
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or a6, a6, t0
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: addi t2, sp, 16
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: or a4, t0, a7
+; RV32I-NEXT: or a5, t3, a5
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: sub a0, t2, a0
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a5, 4(a0)
+; RV32I-NEXT: lw a6, 8(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: sll a7, a5, a1
+; RV32I-NEXT: srli t0, a4, 1
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: srli t1, a6, 1
+; RV32I-NEXT: sll a6, a6, a1
+; RV32I-NEXT: srli a5, a5, 1
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: srl a4, t0, a3
+; RV32I-NEXT: srl t0, t1, a3
+; RV32I-NEXT: srl a3, a5, a3
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: srli t1, a1, 24
+; RV32I-NEXT: srli t2, a1, 8
+; RV32I-NEXT: or a4, a7, a4
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t2, 1(a2)
; RV32I-NEXT: sb a5, 2(a2)
; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: srli a0, a3, 16
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: srli a5, a3, 8
-; RV32I-NEXT: srli a7, a6, 16
-; RV32I-NEXT: srli t0, a6, 24
-; RV32I-NEXT: srli t1, a6, 8
-; RV32I-NEXT: srli t2, a1, 16
-; RV32I-NEXT: srli t3, a1, 24
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: srli a5, a3, 24
+; RV32I-NEXT: srli a6, a3, 8
+; RV32I-NEXT: srli a7, a0, 16
+; RV32I-NEXT: srli t0, a0, 24
+; RV32I-NEXT: srli t1, a0, 8
+; RV32I-NEXT: srli t2, a4, 16
+; RV32I-NEXT: srli t3, a4, 24
; RV32I-NEXT: sb a3, 8(a2)
-; RV32I-NEXT: sb a5, 9(a2)
-; RV32I-NEXT: sb a0, 10(a2)
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a0, a1, 8
-; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a6, 9(a2)
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a4, 8
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t1, 13(a2)
; RV32I-NEXT: sb a7, 14(a2)
; RV32I-NEXT: sb t0, 15(a2)
-; RV32I-NEXT: sb a1, 4(a2)
-; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
; RV32I-NEXT: addi sp, sp, 32
@@ -1168,73 +1186,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 8(a0)
+; RV32I-NEXT: lbu t3, 9(a0)
+; RV32I-NEXT: lbu t4, 10(a0)
+; RV32I-NEXT: lbu t5, 11(a0)
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: mv t0, sp
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: srli a4, a0, 3
-; RV32I-NEXT: or a5, t1, a5
-; RV32I-NEXT: andi t1, a0, 31
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srai t3, t4, 31
-; RV32I-NEXT: andi a4, a4, 12
-; RV32I-NEXT: xori t1, t1, 31
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or a4, t3, a4
+; RV32I-NEXT: or t3, t5, t4
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, sp
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: sw t3, 16(sp)
-; RV32I-NEXT: sw t3, 20(sp)
-; RV32I-NEXT: sw t3, 24(sp)
-; RV32I-NEXT: sw t3, 28(sp)
-; RV32I-NEXT: add a4, t0, a4
+; RV32I-NEXT: or a4, t3, a4
+; RV32I-NEXT: or a7, t2, t0
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a5, 4(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw a1, 12(sp)
-; RV32I-NEXT: lw a1, 4(a4)
-; RV32I-NEXT: lw a3, 8(a4)
-; RV32I-NEXT: lw a5, 0(a4)
-; RV32I-NEXT: lw a4, 12(a4)
-; RV32I-NEXT: srl a6, a1, a0
-; RV32I-NEXT: slli a7, a3, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli t0, a4, 1
-; RV32I-NEXT: sra a0, a4, a0
-; RV32I-NEXT: sll a4, a7, t1
-; RV32I-NEXT: sll a1, a1, t1
-; RV32I-NEXT: sll a7, t0, t1
+; RV32I-NEXT: sw a6, 4(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a7, 12(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: add a0, a5, a0
+; RV32I-NEXT: lw a4, 4(a0)
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: lw a6, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: srl a7, a4, a1
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: srl a6, a6, a1
+; RV32I-NEXT: slli a4, a4, 1
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a3
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: sll a3, t1, a3
; RV32I-NEXT: srli t0, a0, 16
; RV32I-NEXT: srli t1, a0, 24
; RV32I-NEXT: srli t2, a0, 8
+; RV32I-NEXT: or a1, a7, a1
; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: or a3, a3, a7
+; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t2, 13(a2)
; RV32I-NEXT: sb t0, 14(a2)
@@ -1242,21 +1269,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli a0, a3, 16
; RV32I-NEXT: srli a5, a3, 24
; RV32I-NEXT: srli a6, a3, 8
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: srli t1, a1, 8
-; RV32I-NEXT: srli t2, a4, 16
-; RV32I-NEXT: srli t3, a4, 24
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: srli t0, a4, 24
+; RV32I-NEXT: srli t1, a4, 8
+; RV32I-NEXT: srli t2, a1, 16
+; RV32I-NEXT: srli t3, a1, 24
; RV32I-NEXT: sb a3, 8(a2)
; RV32I-NEXT: sb a6, 9(a2)
; RV32I-NEXT: sb a0, 10(a2)
; RV32I-NEXT: sb a5, 11(a2)
-; RV32I-NEXT: srli a0, a4, 8
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a0, a1, 8
+; RV32I-NEXT: sb a4, 0(a2)
; RV32I-NEXT: sb t1, 1(a2)
; RV32I-NEXT: sb a7, 2(a2)
; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: sb a1, 4(a2)
; RV32I-NEXT: sb a0, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
@@ -1272,17 +1299,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -1299,122 +1328,143 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli s8, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a5, a4, a3
+; RV64I-NEXT: or a6, a6, s8
+; RV64I-NEXT: or a3, t0, a7
+; RV64I-NEXT: or a4, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd zero, 40(sp)
; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: mv s2, sp
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: xori s5, s5, 63
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: add s2, s2, s3
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: mv a6, sp
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t1, a1
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t1, s0, t6
+; RV64I-NEXT: or t2, s5, s1
+; RV64I-NEXT: or t3, s3, s2
+; RV64I-NEXT: or a1, a1, s4
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t2, t2, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a3, a3, a5
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a5, t2, t1
+; RV64I-NEXT: or a1, a1, t3
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a1, 8(s2)
-; RV64I-NEXT: ld a3, 16(s2)
-; RV64I-NEXT: ld a4, 0(s2)
-; RV64I-NEXT: ld a5, 24(s2)
-; RV64I-NEXT: srl a6, a1, a0
-; RV64I-NEXT: slli a7, a3, 1
-; RV64I-NEXT: srl a4, a4, a0
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: srl a3, a3, a0
+; RV64I-NEXT: sd a0, 16(sp)
+; RV64I-NEXT: sd a5, 24(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: add a0, a6, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: ld a5, 16(a0)
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: srl a7, a4, a1
; RV64I-NEXT: slli t0, a5, 1
-; RV64I-NEXT: srl a5, a5, a0
-; RV64I-NEXT: sll a0, a7, s5
-; RV64I-NEXT: sll a1, a1, s5
-; RV64I-NEXT: sll a7, t0, s5
-; RV64I-NEXT: srli t0, a5, 56
-; RV64I-NEXT: srli t1, a5, 48
-; RV64I-NEXT: srli t2, a5, 40
-; RV64I-NEXT: srli t3, a5, 32
-; RV64I-NEXT: srli t4, a5, 24
-; RV64I-NEXT: srli t5, a5, 16
-; RV64I-NEXT: srli t6, a5, 8
-; RV64I-NEXT: or a0, a6, a0
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a3, a3, a7
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli t1, a0, 1
+; RV64I-NEXT: srl t2, a0, a1
+; RV64I-NEXT: sll a0, t0, a3
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: sll a3, t1, a3
+; RV64I-NEXT: srli a4, t2, 56
+; RV64I-NEXT: srli t0, t2, 48
+; RV64I-NEXT: srli t1, t2, 40
+; RV64I-NEXT: srli t3, t2, 32
+; RV64I-NEXT: srli t4, t2, 24
+; RV64I-NEXT: srli t5, t2, 16
+; RV64I-NEXT: srli t6, t2, 8
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: sb t3, 28(a2)
-; RV64I-NEXT: sb t2, 29(a2)
-; RV64I-NEXT: sb t1, 30(a2)
-; RV64I-NEXT: sb t0, 31(a2)
-; RV64I-NEXT: sb a5, 24(a2)
+; RV64I-NEXT: sb t1, 29(a2)
+; RV64I-NEXT: sb t0, 30(a2)
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: sb t2, 24(a2)
; RV64I-NEXT: sb t6, 25(a2)
; RV64I-NEXT: sb t5, 26(a2)
; RV64I-NEXT: sb t4, 27(a2)
@@ -1463,17 +1513,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 9(a2)
; RV64I-NEXT: sb a5, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
@@ -1498,55 +1550,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t6, 7(a0)
-; RV32I-NEXT: lbu s2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s7, 12(a0)
-; RV32I-NEXT: lbu s8, 13(a0)
-; RV32I-NEXT: lbu s9, 14(a0)
-; RV32I-NEXT: lbu s10, 15(a0)
-; RV32I-NEXT: lbu s11, 16(a0)
-; RV32I-NEXT: lbu ra, 17(a0)
-; RV32I-NEXT: lbu t4, 18(a0)
-; RV32I-NEXT: lbu s0, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t6, t3
-; RV32I-NEXT: or a7, s3, s2
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s5, 25(a0)
-; RV32I-NEXT: lbu s6, 26(a0)
-; RV32I-NEXT: lbu t6, 27(a0)
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: slli ra, ra, 8
-; RV32I-NEXT: or s7, s8, s7
-; RV32I-NEXT: or s2, s10, s9
-; RV32I-NEXT: or s3, ra, s11
-; RV32I-NEXT: lbu s4, 28(a0)
-; RV32I-NEXT: lbu s8, 29(a0)
-; RV32I-NEXT: lbu s9, 30(a0)
-; RV32I-NEXT: lbu s10, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
; RV32I-NEXT: sw zero, 64(sp)
@@ -1555,90 +1619,89 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw zero, 44(sp)
; RV32I-NEXT: sw zero, 48(sp)
; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 8
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t4, s0, t4
-; RV32I-NEXT: addi s0, sp, 8
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw t3, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
; RV32I-NEXT: srli a1, a0, 3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s5, t3
-; RV32I-NEXT: or t6, t6, s6
-; RV32I-NEXT: or s1, s8, s4
-; RV32I-NEXT: or s4, s10, s9
-; RV32I-NEXT: andi s5, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, s2, s7
-; RV32I-NEXT: or a7, t4, s3
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, t6, t3
-; RV32I-NEXT: or t2, s4, s1
-; RV32I-NEXT: add s0, s0, s5
-; RV32I-NEXT: sw a7, 24(sp)
-; RV32I-NEXT: sw t0, 28(sp)
-; RV32I-NEXT: sw t1, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a3, 8(sp)
-; RV32I-NEXT: sw a4, 12(sp)
-; RV32I-NEXT: sw a5, 16(sp)
-; RV32I-NEXT: sw a6, 20(sp)
-; RV32I-NEXT: lw a3, 0(s0)
-; RV32I-NEXT: lw a4, 4(s0)
-; RV32I-NEXT: lw a5, 8(s0)
-; RV32I-NEXT: lw a6, 12(s0)
-; RV32I-NEXT: lw a7, 16(s0)
-; RV32I-NEXT: lw t0, 20(s0)
-; RV32I-NEXT: lw t1, 24(s0)
-; RV32I-NEXT: lw t2, 28(s0)
-; RV32I-NEXT: srl t3, a4, a0
-; RV32I-NEXT: slli t4, a5, 1
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: add a4, s3, a4
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: lw a5, 4(a4)
+; RV32I-NEXT: lw a6, 8(a4)
+; RV32I-NEXT: lw a7, 12(a4)
+; RV32I-NEXT: lw t0, 16(a4)
+; RV32I-NEXT: lw t1, 20(a4)
+; RV32I-NEXT: lw t2, 24(a4)
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: srl t3, a5, a0
+; RV32I-NEXT: slli t4, a6, 1
; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli a4, a4, 1
-; RV32I-NEXT: srl t5, a6, a0
-; RV32I-NEXT: slli t6, a7, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: srl s0, t0, a0
-; RV32I-NEXT: slli s1, t1, 1
-; RV32I-NEXT: srl a7, a7, a0
-; RV32I-NEXT: slli t0, t0, 1
-; RV32I-NEXT: srl t1, t1, a0
-; RV32I-NEXT: slli s2, t2, 1
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: srl t5, a7, a0
+; RV32I-NEXT: slli t6, t0, 1
+; RV32I-NEXT: srl a6, a6, a0
+; RV32I-NEXT: slli a7, a7, 1
+; RV32I-NEXT: srl s0, t1, a0
+; RV32I-NEXT: slli s1, t2, 1
+; RV32I-NEXT: srl t0, t0, a0
+; RV32I-NEXT: slli t1, t1, 1
; RV32I-NEXT: srl t2, t2, a0
+; RV32I-NEXT: slli s2, a4, 1
+; RV32I-NEXT: srl s3, a4, a0
; RV32I-NEXT: sll a0, t4, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll t4, t6, a1
-; RV32I-NEXT: sll a6, a6, a1
-; RV32I-NEXT: sll t6, s1, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll s1, s2, a1
-; RV32I-NEXT: srli s2, t2, 24
-; RV32I-NEXT: srli s3, t2, 16
-; RV32I-NEXT: srli s4, t2, 8
+; RV32I-NEXT: sll a4, a5, a1
+; RV32I-NEXT: sll a5, t6, a1
+; RV32I-NEXT: sll a7, a7, a1
+; RV32I-NEXT: sll t4, s1, a1
+; RV32I-NEXT: sll t1, t1, a1
+; RV32I-NEXT: sll t6, s2, a1
+; RV32I-NEXT: srli s1, s3, 24
+; RV32I-NEXT: srli s2, s3, 16
+; RV32I-NEXT: srli s4, s3, 8
; RV32I-NEXT: or a0, t3, a0
; RV32I-NEXT: or a1, a3, a4
-; RV32I-NEXT: or a3, t5, t4
-; RV32I-NEXT: or a4, a5, a6
-; RV32I-NEXT: or a5, s0, t6
-; RV32I-NEXT: or a6, a7, t0
-; RV32I-NEXT: or a7, t1, s1
-; RV32I-NEXT: sb t2, 28(a2)
+; RV32I-NEXT: or a3, t5, a5
+; RV32I-NEXT: or a4, a6, a7
+; RV32I-NEXT: or a5, s0, t4
+; RV32I-NEXT: or a6, t0, t1
+; RV32I-NEXT: or a7, t2, t6
+; RV32I-NEXT: sb s3, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
-; RV32I-NEXT: sb s3, 30(a2)
-; RV32I-NEXT: sb s2, 31(a2)
+; RV32I-NEXT: sb s2, 30(a2)
+; RV32I-NEXT: sb s1, 31(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
; RV32I-NEXT: srli t2, a7, 8
@@ -1712,17 +1775,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -1739,125 +1804,146 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli s8, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a5, a4, a3
+; RV64I-NEXT: or a6, a6, s8
+; RV64I-NEXT: or a3, t0, a7
+; RV64I-NEXT: or a4, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd zero, 8(sp)
; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: addi s2, sp, 32
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: sub t2, s2, s3
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: addi a6, sp, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t1, a1
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t1, s0, t6
+; RV64I-NEXT: or t2, s5, s1
+; RV64I-NEXT: or t3, s3, s2
+; RV64I-NEXT: or a1, a1, s4
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t2, t2, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a3, a3, a5
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a5, t2, t1
+; RV64I-NEXT: or a1, a1, t3
; RV64I-NEXT: sd a3, 32(sp)
; RV64I-NEXT: sd a4, 40(sp)
-; RV64I-NEXT: sd a5, 48(sp)
-; RV64I-NEXT: sd a1, 56(sp)
-; RV64I-NEXT: ld a1, 0(t2)
-; RV64I-NEXT: ld a3, 8(t2)
-; RV64I-NEXT: ld a4, 16(t2)
-; RV64I-NEXT: ld a5, 24(t2)
-; RV64I-NEXT: xori a6, s5, 63
-; RV64I-NEXT: sll a7, a3, a0
-; RV64I-NEXT: srli t0, a1, 1
-; RV64I-NEXT: sll a5, a5, a0
-; RV64I-NEXT: srli t1, a4, 1
-; RV64I-NEXT: sll a4, a4, a0
-; RV64I-NEXT: srli a3, a3, 1
-; RV64I-NEXT: sll t2, a1, a0
-; RV64I-NEXT: srl a0, t0, a6
-; RV64I-NEXT: srl a1, t1, a6
-; RV64I-NEXT: srl a3, a3, a6
-; RV64I-NEXT: srli a6, t2, 56
-; RV64I-NEXT: srli t0, t2, 48
-; RV64I-NEXT: srli t1, t2, 40
-; RV64I-NEXT: srli t3, t2, 32
-; RV64I-NEXT: srli t4, t2, 24
-; RV64I-NEXT: srli t5, t2, 16
-; RV64I-NEXT: srli t6, t2, 8
-; RV64I-NEXT: or a0, a7, a0
-; RV64I-NEXT: or a1, a5, a1
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: sb t3, 4(a2)
-; RV64I-NEXT: sb t1, 5(a2)
-; RV64I-NEXT: sb t0, 6(a2)
-; RV64I-NEXT: sb a6, 7(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t6, 1(a2)
-; RV64I-NEXT: sb t5, 2(a2)
-; RV64I-NEXT: sb t4, 3(a2)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a5, 56(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: sub a0, a6, a0
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a5, 8(a0)
+; RV64I-NEXT: ld a6, 16(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: sll a7, a5, a1
+; RV64I-NEXT: srli t0, a4, 1
+; RV64I-NEXT: sll t1, a0, a1
+; RV64I-NEXT: srli a0, a6, 1
+; RV64I-NEXT: sll a6, a6, a1
+; RV64I-NEXT: srli a5, a5, 1
+; RV64I-NEXT: sll a4, a4, a1
+; RV64I-NEXT: srl a1, t0, a3
+; RV64I-NEXT: srl t0, a0, a3
+; RV64I-NEXT: srl a3, a5, a3
+; RV64I-NEXT: srli a5, a4, 56
+; RV64I-NEXT: srli t2, a4, 48
+; RV64I-NEXT: srli t3, a4, 40
+; RV64I-NEXT: srli t4, a4, 32
+; RV64I-NEXT: srli t5, a4, 24
+; RV64I-NEXT: srli t6, a4, 16
+; RV64I-NEXT: srli s0, a4, 8
+; RV64I-NEXT: or a0, a7, a1
+; RV64I-NEXT: or a1, t1, t0
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: sb t4, 4(a2)
+; RV64I-NEXT: sb t3, 5(a2)
+; RV64I-NEXT: sb t2, 6(a2)
+; RV64I-NEXT: sb a5, 7(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s0, 1(a2)
+; RV64I-NEXT: sb t6, 2(a2)
+; RV64I-NEXT: sb t5, 3(a2)
; RV64I-NEXT: srli a4, a3, 56
; RV64I-NEXT: srli a5, a3, 48
; RV64I-NEXT: srli a6, a3, 40
@@ -1903,17 +1989,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 9(a2)
; RV64I-NEXT: sb a5, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
@@ -1938,55 +2026,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t6, 7(a0)
-; RV32I-NEXT: lbu s2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s7, 12(a0)
-; RV32I-NEXT: lbu s8, 13(a0)
-; RV32I-NEXT: lbu s9, 14(a0)
-; RV32I-NEXT: lbu s10, 15(a0)
-; RV32I-NEXT: lbu s11, 16(a0)
-; RV32I-NEXT: lbu ra, 17(a0)
-; RV32I-NEXT: lbu t4, 18(a0)
-; RV32I-NEXT: lbu s0, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t6, t3
-; RV32I-NEXT: or a7, s3, s2
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s5, 25(a0)
-; RV32I-NEXT: lbu s6, 26(a0)
-; RV32I-NEXT: lbu t6, 27(a0)
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: slli ra, ra, 8
-; RV32I-NEXT: or s7, s8, s7
-; RV32I-NEXT: or s2, s10, s9
-; RV32I-NEXT: or s3, ra, s11
-; RV32I-NEXT: lbu s4, 28(a0)
-; RV32I-NEXT: lbu s8, 29(a0)
-; RV32I-NEXT: lbu s9, 30(a0)
-; RV32I-NEXT: lbu s10, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
; RV32I-NEXT: sw zero, 32(sp)
@@ -1995,89 +2095,88 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw zero, 12(sp)
; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 40
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t4, s0, t4
-; RV32I-NEXT: addi s0, sp, 40
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw t2, 64(sp)
+; RV32I-NEXT: sw t3, 68(sp)
+; RV32I-NEXT: sw a4, 40(sp)
+; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: sw a7, 52(sp)
; RV32I-NEXT: srli a1, a0, 3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s5, t3
-; RV32I-NEXT: or t6, t6, s6
-; RV32I-NEXT: or s1, s8, s4
-; RV32I-NEXT: or s4, s10, s9
-; RV32I-NEXT: andi s5, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, s2, s7
-; RV32I-NEXT: or a7, t4, s3
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, t6, t3
-; RV32I-NEXT: or t2, s4, s1
-; RV32I-NEXT: sub t3, s0, s5
-; RV32I-NEXT: sw a7, 56(sp)
-; RV32I-NEXT: sw t0, 60(sp)
-; RV32I-NEXT: sw t1, 64(sp)
-; RV32I-NEXT: sw t2, 68(sp)
-; RV32I-NEXT: sw a3, 40(sp)
-; RV32I-NEXT: sw a4, 44(sp)
-; RV32I-NEXT: sw a5, 48(sp)
-; RV32I-NEXT: sw a6, 52(sp)
-; RV32I-NEXT: lw a3, 0(t3)
-; RV32I-NEXT: lw a4, 4(t3)
-; RV32I-NEXT: lw a5, 8(t3)
-; RV32I-NEXT: lw a6, 12(t3)
-; RV32I-NEXT: lw a7, 16(t3)
-; RV32I-NEXT: lw t0, 20(t3)
-; RV32I-NEXT: lw t1, 24(t3)
-; RV32I-NEXT: lw t2, 28(t3)
-; RV32I-NEXT: sll t3, a4, a0
-; RV32I-NEXT: srli t4, a3, 1
-; RV32I-NEXT: sll t5, a6, a0
-; RV32I-NEXT: srli t6, a5, 1
-; RV32I-NEXT: sll a5, a5, a0
-; RV32I-NEXT: srli a4, a4, 1
-; RV32I-NEXT: sll s0, t0, a0
-; RV32I-NEXT: srli s1, a7, 1
-; RV32I-NEXT: sll a7, a7, a0
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: sub a3, s3, a4
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: lw a5, 4(a3)
+; RV32I-NEXT: lw a6, 8(a3)
+; RV32I-NEXT: lw a7, 12(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw t1, 20(a3)
+; RV32I-NEXT: lw t2, 24(a3)
+; RV32I-NEXT: lw a3, 28(a3)
+; RV32I-NEXT: sll t3, a5, a0
+; RV32I-NEXT: srli t4, a4, 1
+; RV32I-NEXT: sll t5, a7, a0
+; RV32I-NEXT: srli t6, a6, 1
+; RV32I-NEXT: sll a6, a6, a0
+; RV32I-NEXT: srli a5, a5, 1
+; RV32I-NEXT: sll s0, t1, a0
+; RV32I-NEXT: srli s1, t0, 1
+; RV32I-NEXT: sll t0, t0, a0
+; RV32I-NEXT: srli a7, a7, 1
+; RV32I-NEXT: sll s2, a3, a0
+; RV32I-NEXT: srli a3, t2, 1
; RV32I-NEXT: sll t2, t2, a0
-; RV32I-NEXT: srli s2, t1, 1
-; RV32I-NEXT: sll t1, t1, a0
-; RV32I-NEXT: srli t0, t0, 1
-; RV32I-NEXT: sll s3, a3, a0
+; RV32I-NEXT: srli t1, t1, 1
+; RV32I-NEXT: sll s3, a4, a0
; RV32I-NEXT: srl a0, t4, a1
-; RV32I-NEXT: srl a3, t6, a1
-; RV32I-NEXT: srl a4, a4, a1
+; RV32I-NEXT: srl a4, t6, a1
+; RV32I-NEXT: srl a5, a5, a1
; RV32I-NEXT: srl t4, s1, a1
-; RV32I-NEXT: srl a6, a6, a1
-; RV32I-NEXT: srl t6, s2, a1
-; RV32I-NEXT: srl t0, t0, a1
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: srl t6, a3, a1
+; RV32I-NEXT: srl t1, t1, a1
; RV32I-NEXT: srli s1, s3, 24
-; RV32I-NEXT: srli s2, s3, 16
-; RV32I-NEXT: srli s4, s3, 8
+; RV32I-NEXT: srli s4, s3, 16
+; RV32I-NEXT: srli s5, s3, 8
; RV32I-NEXT: or a0, t3, a0
-; RV32I-NEXT: or a1, t5, a3
-; RV32I-NEXT: or a3, a5, a4
+; RV32I-NEXT: or a1, t5, a4
+; RV32I-NEXT: or a3, a6, a5
; RV32I-NEXT: or a4, s0, t4
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a6, t2, t6
-; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, s2, t6
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: sb s3, 0(a2)
-; RV32I-NEXT: sb s4, 1(a2)
-; RV32I-NEXT: sb s2, 2(a2)
+; RV32I-NEXT: sb s5, 1(a2)
+; RV32I-NEXT: sb s4, 2(a2)
; RV32I-NEXT: sb s1, 3(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
@@ -2152,17 +2251,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -2179,123 +2280,144 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: mv s2, sp
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: xori s5, s5, 63
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
+; RV64I-NEXT: mv s6, sp
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a5, t0, a7
; RV64I-NEXT: or a6, t2, t1
; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: add s2, s2, s3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t0, s0, t6
+; RV64I-NEXT: or t1, s5, s1
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: or a1, a1, s4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t2, t1, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t3, t1, 32
+; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: sraiw t1, t1, 31
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t2, a1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t3, t0
+; RV64I-NEXT: or a1, a1, t2
; RV64I-NEXT: sd t1, 32(sp)
; RV64I-NEXT: sd t1, 40(sp)
; RV64I-NEXT: sd t1, 48(sp)
; RV64I-NEXT: sd t1, 56(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a1, 8(s2)
-; RV64I-NEXT: ld a3, 16(s2)
-; RV64I-NEXT: ld a4, 0(s2)
-; RV64I-NEXT: ld a5, 24(s2)
-; RV64I-NEXT: srl a6, a1, a0
-; RV64I-NEXT: slli a7, a3, 1
-; RV64I-NEXT: srl a4, a4, a0
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: srl a3, a3, a0
+; RV64I-NEXT: sd a0, 16(sp)
+; RV64I-NEXT: sd a5, 24(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: add a0, s6, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: ld a5, 16(a0)
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: srl a7, a4, a1
; RV64I-NEXT: slli t0, a5, 1
-; RV64I-NEXT: sra a5, a5, a0
-; RV64I-NEXT: sll a0, a7, s5
-; RV64I-NEXT: sll a1, a1, s5
-; RV64I-NEXT: sll a7, t0, s5
-; RV64I-NEXT: srli t0, a5, 56
-; RV64I-NEXT: srli t1, a5, 48
-; RV64I-NEXT: srli t2, a5, 40
-; RV64I-NEXT: srli t3, a5, 32
-; RV64I-NEXT: srli t4, a5, 24
-; RV64I-NEXT: srli t5, a5, 16
-; RV64I-NEXT: srli t6, a5, 8
-; RV64I-NEXT: or a0, a6, a0
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a3, a3, a7
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli t1, a0, 1
+; RV64I-NEXT: sra t2, a0, a1
+; RV64I-NEXT: sll a0, t0, a3
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: sll a3, t1, a3
+; RV64I-NEXT: srli a4, t2, 56
+; RV64I-NEXT: srli t0, t2, 48
+; RV64I-NEXT: srli t1, t2, 40
+; RV64I-NEXT: srli t3, t2, 32
+; RV64I-NEXT: srli t4, t2, 24
+; RV64I-NEXT: srli t5, t2, 16
+; RV64I-NEXT: srli t6, t2, 8
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: sb t3, 28(a2)
-; RV64I-NEXT: sb t2, 29(a2)
-; RV64I-NEXT: sb t1, 30(a2)
-; RV64I-NEXT: sb t0, 31(a2)
-; RV64I-NEXT: sb a5, 24(a2)
+; RV64I-NEXT: sb t1, 29(a2)
+; RV64I-NEXT: sb t0, 30(a2)
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: sb t2, 24(a2)
; RV64I-NEXT: sb t6, 25(a2)
; RV64I-NEXT: sb t5, 26(a2)
; RV64I-NEXT: sb t4, 27(a2)
@@ -2316,45 +2438,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: srli s3, a0, 56
; RV64I-NEXT: srli s4, a0, 48
; RV64I-NEXT: srli s5, a0, 40
+; RV64I-NEXT: srli s6, a0, 32
; RV64I-NEXT: sb a7, 20(a2)
; RV64I-NEXT: sb a6, 21(a2)
; RV64I-NEXT: sb a5, 22(a2)
; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a0, 32
+; RV64I-NEXT: srli a4, a0, 24
; RV64I-NEXT: sb a3, 16(a2)
; RV64I-NEXT: sb t2, 17(a2)
; RV64I-NEXT: sb t1, 18(a2)
; RV64I-NEXT: sb t0, 19(a2)
-; RV64I-NEXT: srli a3, a0, 24
+; RV64I-NEXT: srli a3, a0, 16
; RV64I-NEXT: sb t6, 4(a2)
; RV64I-NEXT: sb t5, 5(a2)
; RV64I-NEXT: sb t4, 6(a2)
; RV64I-NEXT: sb t3, 7(a2)
-; RV64I-NEXT: srli a5, a0, 16
+; RV64I-NEXT: srli a5, a0, 8
; RV64I-NEXT: sb a1, 0(a2)
; RV64I-NEXT: sb s2, 1(a2)
; RV64I-NEXT: sb s1, 2(a2)
; RV64I-NEXT: sb s0, 3(a2)
-; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a4, 12(a2)
+; RV64I-NEXT: sb s6, 12(a2)
; RV64I-NEXT: sb s5, 13(a2)
; RV64I-NEXT: sb s4, 14(a2)
; RV64I-NEXT: sb s3, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a5, 10(a2)
-; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: sb a5, 9(a2)
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: sb a4, 11(a2)
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
@@ -2379,148 +2503,159 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t6, 8(a0)
-; RV32I-NEXT: lbu s0, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s6, 12(a0)
-; RV32I-NEXT: lbu s7, 13(a0)
-; RV32I-NEXT: lbu s8, 14(a0)
-; RV32I-NEXT: lbu s9, 15(a0)
-; RV32I-NEXT: lbu s10, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu s2, 18(a0)
-; RV32I-NEXT: lbu s3, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t4, t3
-; RV32I-NEXT: or a7, s0, t6
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s4, 25(a0)
-; RV32I-NEXT: lbu s5, 26(a0)
-; RV32I-NEXT: lbu ra, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t6, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: lbu s6, 28(a0)
-; RV32I-NEXT: lbu s7, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu ra, 24(a0)
+; RV32I-NEXT: lbu a3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or s2, s3, s2
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: srli a1, a0, 3
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s0, 29(a0)
+; RV32I-NEXT: lbu s1, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s2, s7, s6
+; RV32I-NEXT: or s3, s9, s8
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: lbu s5, 0(a1)
+; RV32I-NEXT: lbu s6, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: addi s8, sp, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or s1, a0, s1
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: srai s0, a0, 31
+; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, a0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, s2, t3
+; RV32I-NEXT: or t1, s4, s3
+; RV32I-NEXT: or a3, t4, a3
; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s4, t3
-; RV32I-NEXT: or s1, ra, s5
-; RV32I-NEXT: or s4, s7, s6
-; RV32I-NEXT: or s5, s9, s8
-; RV32I-NEXT: srai s6, s9, 31
-; RV32I-NEXT: andi s7, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t6, t4
-; RV32I-NEXT: or a7, s2, s0
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, s1, t3
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: sw s6, 56(sp)
-; RV32I-NEXT: sw s6, 60(sp)
-; RV32I-NEXT: sw s6, 64(sp)
-; RV32I-NEXT: sw s6, 68(sp)
-; RV32I-NEXT: sw s6, 40(sp)
-; RV32I-NEXT: sw s6, 44(sp)
-; RV32I-NEXT: sw s6, 48(sp)
-; RV32I-NEXT: sw s6, 52(sp)
-; RV32I-NEXT: add s3, s3, s7
-; RV32I-NEXT: sw a7, 24(sp)
-; RV32I-NEXT: sw t0, 28(sp)
-; RV32I-NEXT: sw t1, 32(sp)
+; RV32I-NEXT: or a0, a1, t6
+; RV32I-NEXT: sw s0, 56(sp)
+; RV32I-NEXT: sw s0, 60(sp)
+; RV32I-NEXT: sw s0, 64(sp)
+; RV32I-NEXT: sw s0, 68(sp)
+; RV32I-NEXT: sw s0, 40(sp)
+; RV32I-NEXT: sw s0, 44(sp)
+; RV32I-NEXT: sw s0, 48(sp)
+; RV32I-NEXT: sw s0, 52(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw a3, 32(sp)
; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a3, 8(sp)
-; RV32I-NEXT: sw a4, 12(sp)
-; RV32I-NEXT: sw a5, 16(sp)
-; RV32I-NEXT: sw a6, 20(sp)
-; RV32I-NEXT: lw a3, 0(s3)
-; RV32I-NEXT: lw a4, 4(s3)
-; RV32I-NEXT: lw a5, 8(s3)
-; RV32I-NEXT: lw a6, 12(s3)
-; RV32I-NEXT: lw a7, 16(s3)
-; RV32I-NEXT: lw t0, 20(s3)
-; RV32I-NEXT: lw t1, 24(s3)
-; RV32I-NEXT: lw t2, 28(s3)
-; RV32I-NEXT: srl t3, a4, a0
-; RV32I-NEXT: slli t4, a5, 1
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: srli a1, a0, 3
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: add a4, s8, a4
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: lw a5, 4(a4)
+; RV32I-NEXT: lw a6, 8(a4)
+; RV32I-NEXT: lw a7, 12(a4)
+; RV32I-NEXT: lw t0, 16(a4)
+; RV32I-NEXT: lw t1, 20(a4)
+; RV32I-NEXT: lw t2, 24(a4)
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: srl t3, a5, a0
+; RV32I-NEXT: slli t4, a6, 1
; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli a4, a4, 1
-; RV32I-NEXT: srl t5, a6, a0
-; RV32I-NEXT: slli t6, a7, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: srl s0, t0, a0
-; RV32I-NEXT: slli s1, t1, 1
-; RV32I-NEXT: srl a7, a7, a0
-; RV32I-NEXT: slli t0, t0, 1
-; RV32I-NEXT: srl t1, t1, a0
-; RV32I-NEXT: slli s2, t2, 1
-; RV32I-NEXT: sra t2, t2, a0
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: srl t5, a7, a0
+; RV32I-NEXT: slli t6, t0, 1
+; RV32I-NEXT: srl a6, a6, a0
+; RV32I-NEXT: slli a7, a7, 1
+; RV32I-NEXT: srl s0, t1, a0
+; RV32I-NEXT: slli s1, t2, 1
+; RV32I-NEXT: srl t0, t0, a0
+; RV32I-NEXT: slli t1, t1, 1
+; RV32I-NEXT: srl t2, t2, a0
+; RV32I-NEXT: slli s2, a4, 1
+; RV32I-NEXT: sra s3, a4, a0
; RV32I-NEXT: sll a0, t4, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll t4, t6, a1
-; RV32I-NEXT: sll a6, a6, a1
-; RV32I-NEXT: sll t6, s1, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll s1, s2, a1
-; RV32I-NEXT: srli s2, t2, 24
-; RV32I-NEXT: srli s3, t2, 16
-; RV32I-NEXT: srli s4, t2, 8
+; RV32I-NEXT: sll a4, a5, a1
+; RV32I-NEXT: sll a5, t6, a1
+; RV32I-NEXT: sll a7, a7, a1
+; RV32I-NEXT: sll t4, s1, a1
+; RV32I-NEXT: sll t1, t1, a1
+; RV32I-NEXT: sll t6, s2, a1
+; RV32I-NEXT: srli s1, s3, 24
+; RV32I-NEXT: srli s2, s3, 16
+; RV32I-NEXT: srli s4, s3, 8
; RV32I-NEXT: or a0, t3, a0
; RV32I-NEXT: or a1, a3, a4
-; RV32I-NEXT: or a3, t5, t4
-; RV32I-NEXT: or a4, a5, a6
-; RV32I-NEXT: or a5, s0, t6
-; RV32I-NEXT: or a6, a7, t0
-; RV32I-NEXT: or a7, t1, s1
-; RV32I-NEXT: sb t2, 28(a2)
+; RV32I-NEXT: or a3, t5, a5
+; RV32I-NEXT: or a4, a6, a7
+; RV32I-NEXT: or a5, s0, t4
+; RV32I-NEXT: or a6, t0, t1
+; RV32I-NEXT: or a7, t2, t6
+; RV32I-NEXT: sb s3, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
-; RV32I-NEXT: sb s3, 30(a2)
-; RV32I-NEXT: sb s2, 31(a2)
+; RV32I-NEXT: sb s2, 30(a2)
+; RV32I-NEXT: sb s1, 31(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
; RV32I-NEXT: srli t2, a7, 8
diff --git a/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll b/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll
index 854d0b6..72242f1 100644
--- a/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll
+++ b/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+xandesbfhcvt -target-abi ilp32f \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,XANDESBFHCVT %s
+; RUN: llc -mtriple=riscv32 -mattr=+zfh,+xandesbfhcvt -target-abi ilp32f \
+; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZFH %s
; RUN: llc -mtriple=riscv64 -mattr=+xandesbfhcvt -target-abi lp64f \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,XANDESBFHCVT %s
+; RUN: llc -mtriple=riscv64 -mattr=+zfh,+xandesbfhcvt -target-abi lp64f \
+; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZFH %s
define float @fcvt_s_bf16(bfloat %a) nounwind {
; CHECK-LABEL: fcvt_s_bf16:
@@ -21,3 +25,40 @@ define bfloat @fcvt_bf16_s(float %a) nounwind {
%1 = fptrunc float %a to bfloat
ret bfloat %1
}
+
+; Check load and store of bf16 values.
+define void @loadstorebf16(ptr %bf, ptr %sf) nounwind {
+; XANDESBFHCVT-LABEL: loadstorebf16:
+; XANDESBFHCVT: # %bb.0: # %entry
+; XANDESBFHCVT-NEXT: lhu a2, 0(a0)
+; XANDESBFHCVT-NEXT: lui a3, 1048560
+; XANDESBFHCVT-NEXT: or a2, a2, a3
+; XANDESBFHCVT-NEXT: fmv.w.x fa5, a2
+; XANDESBFHCVT-NEXT: nds.fcvt.s.bf16 fa5, fa5
+; XANDESBFHCVT-NEXT: fsw fa5, 0(a1)
+; XANDESBFHCVT-NEXT: flw fa5, 0(a1)
+; XANDESBFHCVT-NEXT: nds.fcvt.bf16.s fa5, fa5
+; XANDESBFHCVT-NEXT: fmv.x.w a1, fa5
+; XANDESBFHCVT-NEXT: sh a1, 0(a0)
+; XANDESBFHCVT-NEXT: ret
+;
+; ZFH-LABEL: loadstorebf16:
+; ZFH: # %bb.0: # %entry
+; ZFH-NEXT: flh fa5, 0(a0)
+; ZFH-NEXT: nds.fcvt.s.bf16 fa5, fa5
+; ZFH-NEXT: fsw fa5, 0(a1)
+; ZFH-NEXT: flw fa5, 0(a1)
+; ZFH-NEXT: nds.fcvt.bf16.s fa5, fa5
+; ZFH-NEXT: fsh fa5, 0(a0)
+; ZFH-NEXT: ret
+entry:
+ %0 = load bfloat, ptr %bf, align 2
+ %1 = fpext bfloat %0 to float
+ store volatile float %1, ptr %sf, align 4
+
+ %2 = load float, ptr %sf, align 4
+ %3 = fptrunc float %2 to bfloat
+ store volatile bfloat %3, ptr %bf, align 2
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll
index a3b4e78..6fdc63f 100644
--- a/llvm/test/CodeGen/RISCV/xqciac.ll
+++ b/llvm/test/CodeGen/RISCV/xqciac.ll
@@ -231,12 +231,12 @@ define dso_local i32 @pow2(i32 %a, i32 %b) local_unnamed_addr #0 {
;
; RV32IMXQCIAC-LABEL: pow2:
; RV32IMXQCIAC: # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: pow2:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5
; RV32IZBAMXQCIAC-NEXT: ret
entry:
%mul = mul nsw i32 %b, 32
@@ -276,12 +276,12 @@ define dso_local i32 @shladd(i32 %a, i32 %b) local_unnamed_addr #0 {
;
; RV32IMXQCIAC-LABEL: shladd:
; RV32IMXQCIAC: # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: shladd:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31
; RV32IZBAMXQCIAC-NEXT: ret
entry:
%shl = shl nsw i32 %b, 31
@@ -305,9 +305,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 {
; RV32IMXQCIAC-LABEL: shladd64:
; RV32IMXQCIAC: # %bb.0: # %entry
; RV32IMXQCIAC-NEXT: srli a4, a2, 1
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a2, 31
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a2, a0, 31
; RV32IMXQCIAC-NEXT: slli a2, a2, 31
-; RV32IMXQCIAC-NEXT: qc.shladd a3, a4, a3, 31
+; RV32IMXQCIAC-NEXT: qc.shladd a3, a3, a4, 31
; RV32IMXQCIAC-NEXT: sltu a2, a0, a2
; RV32IMXQCIAC-NEXT: add a1, a1, a3
; RV32IMXQCIAC-NEXT: add a1, a1, a2
@@ -316,9 +316,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 {
; RV32IZBAMXQCIAC-LABEL: shladd64:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
; RV32IZBAMXQCIAC-NEXT: srli a4, a2, 1
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a2, 31
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a2, a0, 31
; RV32IZBAMXQCIAC-NEXT: slli a2, a2, 31
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a3, a4, a3, 31
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a3, a3, a4, 31
; RV32IZBAMXQCIAC-NEXT: sltu a2, a0, a2
; RV32IZBAMXQCIAC-NEXT: add a1, a1, a3
; RV32IZBAMXQCIAC-NEXT: add a1, a1, a2
@@ -338,12 +338,12 @@ define dso_local i32 @shladd_ordisjoint(i32 %a, i32 %b) local_unnamed_addr #0 {
;
; RV32IMXQCIAC-LABEL: shladd_ordisjoint:
; RV32IMXQCIAC: # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 22
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 22
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: shladd_ordisjoint:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 22
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 22
; RV32IZBAMXQCIAC-NEXT: ret
entry:
%shl = shl nsw i32 %b, 22
@@ -361,13 +361,13 @@ define dso_local i32 @shladdc1c2(i32 %a, i32 %b) local_unnamed_addr #0 {
;
; RV32IMXQCIAC-LABEL: shladdc1c2:
; RV32IMXQCIAC: # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5
; RV32IMXQCIAC-NEXT: slli a0, a0, 26
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: shladdc1c2:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5
; RV32IZBAMXQCIAC-NEXT: slli a0, a0, 26
; RV32IZBAMXQCIAC-NEXT: ret
entry:
@@ -388,7 +388,7 @@ define dso_local i32 @shxaddc1c2(i32 %a, i32 %b) local_unnamed_addr #0 {
; RV32IMXQCIAC-LABEL: shxaddc1c2:
; RV32IMXQCIAC: # %bb.0: # %entry
; RV32IMXQCIAC-NEXT: slli a1, a1, 28
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: shxaddc1c2:
@@ -417,18 +417,18 @@ define dso_local i64 @shladdc1c264(i64 %a, i64 %b) local_unnamed_addr #0 {
; RV32IMXQCIAC-LABEL: shladdc1c264:
; RV32IMXQCIAC: # %bb.0: # %entry
; RV32IMXQCIAC-NEXT: srli a1, a2, 12
-; RV32IMXQCIAC-NEXT: qc.shladd a1, a1, a3, 20
+; RV32IMXQCIAC-NEXT: qc.shladd a1, a3, a1, 20
; RV32IMXQCIAC-NEXT: slli a2, a2, 20
-; RV32IMXQCIAC-NEXT: qc.shladd a1, a1, a0, 23
+; RV32IMXQCIAC-NEXT: qc.shladd a1, a0, a1, 23
; RV32IMXQCIAC-NEXT: mv a0, a2
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: shladdc1c264:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
; RV32IZBAMXQCIAC-NEXT: srli a1, a2, 12
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a1, a3, 20
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a3, a1, 20
; RV32IZBAMXQCIAC-NEXT: slli a2, a2, 20
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a1, a0, 23
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a0, a1, 23
; RV32IZBAMXQCIAC-NEXT: mv a0, a2
; RV32IZBAMXQCIAC-NEXT: ret
entry:
@@ -449,13 +449,13 @@ define dso_local i32 @shladdc1equalc2(i32 %a, i32 %b) local_unnamed_addr #0 {
; RV32IMXQCIAC-LABEL: shladdc1equalc2:
; RV32IMXQCIAC: # %bb.0: # %entry
; RV32IMXQCIAC-NEXT: slli a1, a1, 12
-; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 12
+; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 12
; RV32IMXQCIAC-NEXT: ret
;
; RV32IZBAMXQCIAC-LABEL: shladdc1equalc2:
; RV32IZBAMXQCIAC: # %bb.0: # %entry
; RV32IZBAMXQCIAC-NEXT: slli a1, a1, 12
-; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 12
+; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 12
; RV32IZBAMXQCIAC-NEXT: ret
entry:
%shlc1 = shl nsw i32 %a, 12
@@ -463,3 +463,30 @@ entry:
%add = add nsw i32 %shlc1, %shlc2
ret i32 %add
}
+
+define i32 @testmuliaddnegimm(i32 %a) {
+; RV32IM-LABEL: testmuliaddnegimm:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a1, a0, 1
+; RV32IM-NEXT: add a0, a1, a0
+; RV32IM-NEXT: li a1, 3
+; RV32IM-NEXT: sub a0, a1, a0
+; RV32IM-NEXT: ret
+;
+; RV32IMXQCIAC-LABEL: testmuliaddnegimm:
+; RV32IMXQCIAC: # %bb.0:
+; RV32IMXQCIAC-NEXT: li a1, 3
+; RV32IMXQCIAC-NEXT: qc.muliadd a1, a0, -3
+; RV32IMXQCIAC-NEXT: mv a0, a1
+; RV32IMXQCIAC-NEXT: ret
+;
+; RV32IZBAMXQCIAC-LABEL: testmuliaddnegimm:
+; RV32IZBAMXQCIAC: # %bb.0:
+; RV32IZBAMXQCIAC-NEXT: li a1, 3
+; RV32IZBAMXQCIAC-NEXT: qc.muliadd a1, a0, -3
+; RV32IZBAMXQCIAC-NEXT: mv a0, a1
+; RV32IZBAMXQCIAC-NEXT: ret
+ %mul = mul i32 %a, -3
+ %add = add i32 %mul, 3
+ ret i32 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/xqcisls.ll b/llvm/test/CodeGen/RISCV/xqcisls.ll
index 709dc4c..3dea540 100644
--- a/llvm/test/CodeGen/RISCV/xqcisls.ll
+++ b/llvm/test/CodeGen/RISCV/xqcisls.ll
@@ -308,13 +308,13 @@ define i64 @lrd(ptr %a, i32 %b) {
;
; RV32IZBAXQCISLS-LABEL: lrd:
; RV32IZBAXQCISLS: # %bb.0:
-; RV32IZBAXQCISLS-NEXT: qc.lrw a2, a0, a1, 3
-; RV32IZBAXQCISLS-NEXT: addi a0, a0, 4
-; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a0, a1, 3
-; RV32IZBAXQCISLS-NEXT: add a0, a2, a2
-; RV32IZBAXQCISLS-NEXT: sltu a2, a0, a2
-; RV32IZBAXQCISLS-NEXT: add a1, a1, a1
-; RV32IZBAXQCISLS-NEXT: add a1, a1, a2
+; RV32IZBAXQCISLS-NEXT: sh3add a0, a1, a0
+; RV32IZBAXQCISLS-NEXT: lw a1, 0(a0)
+; RV32IZBAXQCISLS-NEXT: lw a2, 4(a0)
+; RV32IZBAXQCISLS-NEXT: add a0, a1, a1
+; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1
+; RV32IZBAXQCISLS-NEXT: add a2, a2, a2
+; RV32IZBAXQCISLS-NEXT: add a1, a2, a1
; RV32IZBAXQCISLS-NEXT: ret
%1 = getelementptr i64, ptr %a, i32 %b
%2 = load i64, ptr %1, align 8
@@ -348,14 +348,13 @@ define i64 @lrd_2(ptr %a, i32 %b) {
;
; RV32IZBAXQCISLS-LABEL: lrd_2:
; RV32IZBAXQCISLS: # %bb.0:
-; RV32IZBAXQCISLS-NEXT: addi a2, a0, 96
-; RV32IZBAXQCISLS-NEXT: qc.lrw a2, a2, a1, 3
-; RV32IZBAXQCISLS-NEXT: addi a0, a0, 100
-; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a0, a1, 3
-; RV32IZBAXQCISLS-NEXT: add a0, a2, a2
-; RV32IZBAXQCISLS-NEXT: sltu a2, a0, a2
-; RV32IZBAXQCISLS-NEXT: add a1, a1, a1
-; RV32IZBAXQCISLS-NEXT: add a1, a1, a2
+; RV32IZBAXQCISLS-NEXT: sh3add a0, a1, a0
+; RV32IZBAXQCISLS-NEXT: lw a1, 96(a0)
+; RV32IZBAXQCISLS-NEXT: lw a2, 100(a0)
+; RV32IZBAXQCISLS-NEXT: add a0, a1, a1
+; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1
+; RV32IZBAXQCISLS-NEXT: add a2, a2, a2
+; RV32IZBAXQCISLS-NEXT: add a1, a2, a1
; RV32IZBAXQCISLS-NEXT: ret
%1 = add i32 %b, 12
%2 = getelementptr i64, ptr %a, i32 %1
@@ -472,11 +471,11 @@ define void @srd(ptr %a, i32 %b, i64 %c) {
; RV32IZBAXQCISLS: # %bb.0:
; RV32IZBAXQCISLS-NEXT: add a4, a2, a2
; RV32IZBAXQCISLS-NEXT: add a3, a3, a3
-; RV32IZBAXQCISLS-NEXT: sltu a2, a4, a2
-; RV32IZBAXQCISLS-NEXT: qc.srw a4, a0, a1, 3
-; RV32IZBAXQCISLS-NEXT: add a2, a3, a2
-; RV32IZBAXQCISLS-NEXT: addi a0, a0, 4
-; RV32IZBAXQCISLS-NEXT: qc.srw a2, a0, a1, 3
+; RV32IZBAXQCISLS-NEXT: sh3add a0, a1, a0
+; RV32IZBAXQCISLS-NEXT: sltu a1, a4, a2
+; RV32IZBAXQCISLS-NEXT: add a1, a3, a1
+; RV32IZBAXQCISLS-NEXT: sw a4, 0(a0)
+; RV32IZBAXQCISLS-NEXT: sw a1, 4(a0)
; RV32IZBAXQCISLS-NEXT: ret
%1 = add i64 %c, %c
%2 = getelementptr i64, ptr %a, i32 %b
@@ -503,10 +502,10 @@ define i64 @lrd_large_shift(ptr %a, i32 %b) {
;
; RV32IZBAXQCISLS-LABEL: lrd_large_shift:
; RV32IZBAXQCISLS: # %bb.0:
-; RV32IZBAXQCISLS-NEXT: addi a2, a0, 384
-; RV32IZBAXQCISLS-NEXT: addi a3, a0, 388
-; RV32IZBAXQCISLS-NEXT: qc.lrw a0, a2, a1, 5
-; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a3, a1, 5
+; RV32IZBAXQCISLS-NEXT: slli a1, a1, 5
+; RV32IZBAXQCISLS-NEXT: add a1, a1, a0
+; RV32IZBAXQCISLS-NEXT: lw a0, 384(a1)
+; RV32IZBAXQCISLS-NEXT: lw a1, 388(a1)
; RV32IZBAXQCISLS-NEXT: ret
%1 = add i32 %b, 12
%2 = shl i32 %1, 2
diff --git a/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll
index cdaae23..5724c4f 100644
--- a/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll
@@ -1,33 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d -mattr=+xtheadfmemidx -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV32XTHEADMEMIDX
-; RUN: llc -mtriple=riscv64 -mattr=+d -mattr=+xtheadfmemidx -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV64XTHEADFMEMIDX
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d,+xtheadfmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32XTHEADFMEMIDX
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d,+xtheadfmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64XTHEADFMEMIDX
-define float @flrw(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: flrw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: flrw:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
-; RV64XTHEADFMEMIDX-NEXT: fadd.s fa0, fa5, fa5
-; RV64XTHEADFMEMIDX-NEXT: ret
- %1 = getelementptr float, ptr %a, i64 %b
+define float @flrw(ptr %a, iXLen %b) {
+; CHECK-LABEL: flrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.flrw fa5, a0, a1, 2
+; CHECK-NEXT: fadd.s fa0, fa5, fa5
+; CHECK-NEXT: ret
+ %1 = getelementptr float, ptr %a, iXLen %b
%2 = load float, ptr %1, align 4
%3 = fadd float %2, %2
ret float %3
}
define float @flurw(ptr %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: flurw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: flurw:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: th.flrw fa5, a0, a1, 2
+; RV32XTHEADFMEMIDX-NEXT: fadd.s fa0, fa5, fa5
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: flurw:
; RV64XTHEADFMEMIDX: # %bb.0:
@@ -41,30 +35,24 @@ define float @flurw(ptr %a, i32 %b) {
ret float %4
}
-define void @fsrw(ptr %a, i64 %b, float %c) {
-; RV32XTHEADMEMIDX-LABEL: fsrw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: fsrw:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: fadd.s fa5, fa0, fa0
-; RV64XTHEADFMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
-; RV64XTHEADFMEMIDX-NEXT: ret
+define void @fsrw(ptr %a, iXLen %b, float %c) {
+; CHECK-LABEL: fsrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.s fa5, fa0, fa0
+; CHECK-NEXT: th.fsrw fa5, a0, a1, 2
+; CHECK-NEXT: ret
%1 = fadd float %c, %c
- %2 = getelementptr float, ptr %a, i64 %b
+ %2 = getelementptr float, ptr %a, iXLen %b
store float %1, ptr %2, align 4
ret void
}
define void @fsurw(ptr %a, i32 %b, float %c) {
-; RV32XTHEADMEMIDX-LABEL: fsurw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.s fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: fsurw:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: fadd.s fa5, fa0, fa0
+; RV32XTHEADFMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: fsurw:
; RV64XTHEADFMEMIDX: # %bb.0:
@@ -78,30 +66,24 @@ define void @fsurw(ptr %a, i32 %b, float %c) {
ret void
}
-define double @flrd(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: flrd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: flrd:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
-; RV64XTHEADFMEMIDX-NEXT: fadd.d fa0, fa5, fa5
-; RV64XTHEADFMEMIDX-NEXT: ret
- %1 = getelementptr double, ptr %a, i64 %b
+define double @flrd(ptr %a, iXLen %b) {
+; CHECK-LABEL: flrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.flrd fa5, a0, a1, 3
+; CHECK-NEXT: fadd.d fa0, fa5, fa5
+; CHECK-NEXT: ret
+ %1 = getelementptr double, ptr %a, iXLen %b
%2 = load double, ptr %1, align 8
%3 = fadd double %2, %2
ret double %3
}
define double @flurd(ptr %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: flurd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa0, fa5, fa5
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: flurd:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: th.flrd fa5, a0, a1, 3
+; RV32XTHEADFMEMIDX-NEXT: fadd.d fa0, fa5, fa5
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: flurd:
; RV64XTHEADFMEMIDX: # %bb.0:
@@ -115,30 +97,24 @@ define double @flurd(ptr %a, i32 %b) {
ret double %4
}
-define void @fsrd(ptr %a, i64 %b, double %c) {
-; RV32XTHEADMEMIDX-LABEL: fsrd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADFMEMIDX-LABEL: fsrd:
-; RV64XTHEADFMEMIDX: # %bb.0:
-; RV64XTHEADFMEMIDX-NEXT: fadd.d fa5, fa0, fa0
-; RV64XTHEADFMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
-; RV64XTHEADFMEMIDX-NEXT: ret
+define void @fsrd(ptr %a, iXLen %b, double %c) {
+; CHECK-LABEL: fsrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.d fa5, fa0, fa0
+; CHECK-NEXT: th.fsrd fa5, a0, a1, 3
+; CHECK-NEXT: ret
%1 = fadd double %c, %c
- %2 = getelementptr double, ptr %a, i64 %b
+ %2 = getelementptr double, ptr %a, iXLen %b
store double %1, ptr %2, align 8
ret void
}
define void @fsurd(ptr %a, i32 %b, double %c) {
-; RV32XTHEADMEMIDX-LABEL: fsurd:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: fadd.d fa5, fa0, fa0
-; RV32XTHEADMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: ret
+; RV32XTHEADFMEMIDX-LABEL: fsurd:
+; RV32XTHEADFMEMIDX: # %bb.0:
+; RV32XTHEADFMEMIDX-NEXT: fadd.d fa5, fa0, fa0
+; RV32XTHEADFMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3
+; RV32XTHEADFMEMIDX-NEXT: ret
;
; RV64XTHEADFMEMIDX-LABEL: fsurd:
; RV64XTHEADFMEMIDX: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index fc20fcb..9f0f8d9 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -1,238 +1,156 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d -mattr=+xtheadmemidx -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV32XTHEADMEMIDX
-; RUN: llc -mtriple=riscv64 -mattr=+d -mattr=+xtheadmemidx -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck %s -check-prefix=RV64XTHEADMEMIDX
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d,+xtheadmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32XTHEADMEMIDX
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d,+xtheadmemidx \
+; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64XTHEADMEMIDX
define ptr @lbia(ptr %base, ptr %addr.2, i8 %a) {
-; RV32XTHEADMEMIDX-LABEL: lbia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbia a3, (a0), -1, 0
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV32XTHEADMEMIDX-NEXT: sb a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbia a3, (a0), -1, 0
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sb a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 0
+; CHECK-LABEL: lbia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbia a3, (a0), -1, 0
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sb a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 0
%ld = load i8, ptr %addr
- %addr.1 = getelementptr i8, ptr %base, i8 -1
+ %addr.1 = getelementptr i8, ptr %base, iXLen -1
%res = add i8 %ld, %a
store i8 %res, ptr %addr.2
ret ptr %addr.1
}
define ptr @lbib(ptr %base, i8 %a) {
-; RV32XTHEADMEMIDX-LABEL: lbib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbib a2, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: sb a1, 1(a0)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbib a2, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV64XTHEADMEMIDX-NEXT: sb a1, 1(a0)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 1
+; CHECK-LABEL: lbib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbib a2, (a0), 1, 0
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sb a1, 1(a0)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 1
%ld = load i8, ptr %addr
- %addr.1 = getelementptr i8, ptr %base, i8 2
+ %addr.1 = getelementptr i8, ptr %base, iXLen 2
%res = add i8 %ld, %a
store i8 %res, ptr %addr.1
ret ptr %addr
}
-define ptr @lbuia(ptr %base, ptr %addr.2, i64 %a) {
-; RV32XTHEADMEMIDX-LABEL: lbuia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbuia a4, (a0), -1, 0
-; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4
-; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbuia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbuia a3, (a0), -1, 0
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 0
+define ptr @lbuia(ptr %base, ptr %addr.2, i32 %a) {
+; CHECK-LABEL: lbuia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbuia a3, (a0), -1, 0
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 0
%ld = load i8, ptr %addr
- %zext = zext i8 %ld to i64
- %addr.1 = getelementptr i8, ptr %base, i8 -1
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.2
+ %zext = zext i8 %ld to i32
+ %addr.1 = getelementptr i8, ptr %base, iXLen -1
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.2
ret ptr %addr.1
}
-define ptr @lbuib(ptr %base, i64 %a, ptr %addr.1) {
-; RV32XTHEADMEMIDX-LABEL: lbuib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lbuib a4, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a4, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a1, a4
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4
-; RV32XTHEADMEMIDX-NEXT: sw a1, 0(a3)
-; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lbuib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lbuib a3, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1
-; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i8, ptr %base, i8 1
+define ptr @lbuib(ptr %base, i32 %a, ptr %addr.1) {
+; CHECK-LABEL: lbuib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lbuib a3, (a0), 1, 0
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: sw a1, 0(a2)
+; CHECK-NEXT: ret
+ %addr = getelementptr i8, ptr %base, iXLen 1
%ld = load i8, ptr %addr
- %zext = zext i8 %ld to i64
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.1
+ %zext = zext i8 %ld to i32
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.1
ret ptr %addr
}
define ptr @lhia(ptr %base, ptr %addr.2, i16 %a) {
-; RV32XTHEADMEMIDX-LABEL: lhia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhia a3, (a0), -16, 1
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV32XTHEADMEMIDX-NEXT: sh a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhia a3, (a0), -16, 1
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sh a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 0
+; CHECK-LABEL: lhia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhia a3, (a0), -16, 1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sh a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 0
%ld = load i16, ptr %addr
- %addr.1 = getelementptr i16, ptr %base, i16 -16
+ %addr.1 = getelementptr i16, ptr %base, iXLen -16
%res = add i16 %ld, %a
store i16 %res, ptr %addr.2
ret ptr %addr.1
}
define ptr @lhib(ptr %base, i16 %a) {
-; RV32XTHEADMEMIDX-LABEL: lhib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhib a2, (a0), 2, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: sh a1, 2(a0)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhib a2, (a0), 2, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV64XTHEADMEMIDX-NEXT: sh a1, 2(a0)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 1
+; CHECK-LABEL: lhib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhib a2, (a0), 2, 0
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sh a1, 2(a0)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 1
%ld = load i16, ptr %addr
- %addr.1 = getelementptr i16, ptr %base, i16 2
+ %addr.1 = getelementptr i16, ptr %base, iXLen 2
%res = add i16 %ld, %a
store i16 %res, ptr %addr.1
ret ptr %addr
}
-define ptr @lhuia(ptr %base, ptr %addr.2, i64 %a) {
-; RV32XTHEADMEMIDX-LABEL: lhuia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhuia a4, (a0), -16, 1
-; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4
-; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhuia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhuia a3, (a0), -16, 1
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 0
+define ptr @lhuia(ptr %base, ptr %addr.2, i32 %a) {
+; CHECK-LABEL: lhuia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhuia a3, (a0), -16, 1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 0
%ld = load i16, ptr %addr
- %zext = zext i16 %ld to i64
- %addr.1 = getelementptr i16, ptr %base, i16 -16
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.2
+ %zext = zext i16 %ld to i32
+ %addr.1 = getelementptr i16, ptr %base, iXLen -16
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.2
ret ptr %addr.1
}
-define ptr @lhuib(ptr %base, i64 %a, ptr %addr.1) {
-; RV32XTHEADMEMIDX-LABEL: lhuib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lhuib a4, (a0), 2, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a4, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a4, a1, a4
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4
-; RV32XTHEADMEMIDX-NEXT: sw a1, 0(a3)
-; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lhuib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lhuib a3, (a0), 2, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1
-; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i16, ptr %base, i16 1
+define ptr @lhuib(ptr %base, i32 %a, ptr %addr.1) {
+; CHECK-LABEL: lhuib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lhuib a3, (a0), 2, 0
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: sw a1, 0(a2)
+; CHECK-NEXT: ret
+ %addr = getelementptr i16, ptr %base, iXLen 1
%ld = load i16, ptr %addr
- %zext = zext i16 %ld to i64
- %res = add i64 %zext, %a
- store i64 %res, ptr %addr.1
+ %zext = zext i16 %ld to i32
+ %res = add i32 %zext, %a
+ store i32 %res, ptr %addr.1
ret ptr %addr
}
define ptr @lwia(ptr %base, ptr %addr.2, i32 %a) {
-; RV32XTHEADMEMIDX-LABEL: lwia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lwia a3, (a0), -16, 2
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lwia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lwia a3, (a0), -16, 2
-; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
-; RV64XTHEADMEMIDX-NEXT: sw a2, 0(a1)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 0
+; CHECK-LABEL: lwia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lwia a3, (a0), -16, 2
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %addr = getelementptr i32, ptr %base, iXLen 0
%ld = load i32, ptr %addr
- %addr.1 = getelementptr i32, ptr %base, i32 -16
+ %addr.1 = getelementptr i32, ptr %base, iXLen -16
%res = add i32 %ld, %a
store i32 %res, ptr %addr.2
ret ptr %addr.1
}
define ptr @lwib(ptr %base, i32 %a) {
-; RV32XTHEADMEMIDX-LABEL: lwib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lwib a2, (a0), 4, 0
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: sw a1, 4(a0)
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lwib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lwib a2, (a0), 4, 0
-; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV64XTHEADMEMIDX-NEXT: sw a1, 4(a0)
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 1
+; CHECK-LABEL: lwib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lwib a2, (a0), 4, 0
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sw a1, 4(a0)
+; CHECK-NEXT: ret
+ %addr = getelementptr i32, ptr %base, iXLen 1
%ld = load i32, ptr %addr
- %addr.1 = getelementptr i32, ptr %base, i32 2
+ %addr.1 = getelementptr i32, ptr %base, iXLen 2
%res = add i32 %ld, %a
store i32 %res, ptr %addr.1
ret ptr %addr
@@ -255,10 +173,10 @@ define ptr @lwuia(ptr %base, ptr %addr.2, i64 %a) {
; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 0
+ %addr = getelementptr i32, ptr %base, iXLen 0
%ld = load i32, ptr %addr
%zext = zext i32 %ld to i64
- %addr.1 = getelementptr i32, ptr %base, i32 -16
+ %addr.1 = getelementptr i32, ptr %base, iXLen -16
%res = add i64 %zext, %a
store i64 %res, ptr %addr.2
ret ptr %addr.1
@@ -281,7 +199,7 @@ define ptr @lwuib(ptr %base, i64 %a, ptr %addr.1) {
; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1
; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i32, ptr %base, i32 1
+ %addr = getelementptr i32, ptr %base, iXLen 1
%ld = load i32, ptr %addr
%zext = zext i32 %ld to i64
%res = add i64 %zext, %a
@@ -309,9 +227,9 @@ define ptr @ldia(ptr %base, ptr %addr.2, i64 %a) {
; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2
; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i64, ptr %base, i64 0
+ %addr = getelementptr i64, ptr %base, iXLen 0
%ld = load i64, ptr %addr
- %addr.1 = getelementptr i64, ptr %base, i64 -16
+ %addr.1 = getelementptr i64, ptr %base, iXLen -16
%res = add i64 %ld, %a
store i64 %res, ptr %addr.2
ret ptr %addr.1
@@ -336,117 +254,81 @@ define ptr @ldib(ptr %base, i64 %a) {
; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1
; RV64XTHEADMEMIDX-NEXT: sd a1, 8(a0)
; RV64XTHEADMEMIDX-NEXT: ret
- %addr = getelementptr i64, ptr %base, i64 1
+ %addr = getelementptr i64, ptr %base, iXLen 1
%ld = load i64, ptr %addr
- %addr.1 = getelementptr i64, ptr %base, i64 2
+ %addr.1 = getelementptr i64, ptr %base, iXLen 2
%res = add i64 %ld, %a
store i64 %res, ptr %addr.1
ret ptr %addr
}
define ptr @sbia(ptr %base, i8 %a, i8 %b) {
-; RV32XTHEADMEMIDX-LABEL: sbia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.sbia a1, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: sbia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.sbia a1, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i8, ptr %base, i8 1
+; CHECK-LABEL: sbia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.sbia a1, (a0), 1, 0
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i8, ptr %base, iXLen 1
%res = add i8 %a, %b
store i8 %res, ptr %base
ret ptr %addr.1
}
define ptr @sbib(ptr %base, i8 %a, i8 %b) {
-; RV32XTHEADMEMIDX-LABEL: sbib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.sbib a1, (a0), 1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: sbib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.sbib a1, (a0), 1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i8, ptr %base, i8 1
+; CHECK-LABEL: sbib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.sbib a1, (a0), 1, 0
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i8, ptr %base, iXLen 1
%res = add i8 %a, %b
store i8 %res, ptr %addr.1
ret ptr %addr.1
}
define ptr @shia(ptr %base, i16 %a, i16 %b) {
-; RV32XTHEADMEMIDX-LABEL: shia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.shia a1, (a0), -9, 1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: shia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.shia a1, (a0), -9, 1
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i16, ptr %base, i16 -9
+; CHECK-LABEL: shia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.shia a1, (a0), -9, 1
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i16, ptr %base, iXLen -9
%res = add i16 %a, %b
store i16 %res, ptr %base
ret ptr %addr.1
}
define ptr @shib(ptr %base, i16 %a, i16 %b) {
-; RV32XTHEADMEMIDX-LABEL: shib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.shib a1, (a0), 2, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: shib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.shib a1, (a0), 2, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i16, ptr %base, i16 1
+; CHECK-LABEL: shib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.shib a1, (a0), 2, 0
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i16, ptr %base, iXLen 1
%res = add i16 %a, %b
store i16 %res, ptr %addr.1
ret ptr %addr.1
}
define ptr @swia(ptr %base, i32 %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: swia:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.swia a1, (a0), 8, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: swia:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.swia a1, (a0), 8, 2
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i32, ptr %base, i32 8
+; CHECK-LABEL: swia:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.swia a1, (a0), 8, 2
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i32, ptr %base, iXLen 8
%res = add i32 %a, %b
store i32 %res, ptr %base
ret ptr %addr.1
}
define ptr @swib(ptr %base, i32 %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: swib:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.swib a1, (a0), -13, 3
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: swib:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.swib a1, (a0), -13, 3
-; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i32, ptr %base, i32 -26
+; CHECK-LABEL: swib:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.swib a1, (a0), -13, 3
+; CHECK-NEXT: ret
+ %addr.1 = getelementptr i32, ptr %base, iXLen -26
%res = add i32 %a, %b
store i32 %res, ptr %addr.1
ret ptr %addr.1
@@ -470,7 +352,7 @@ define ptr @sdia(ptr %base, i64 %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
; RV64XTHEADMEMIDX-NEXT: th.sdia a1, (a0), 8, 3
; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i64, ptr %base, i64 8
+ %addr.1 = getelementptr i64, ptr %base, iXLen 8
%res = add i64 %a, %b
store i64 %res, ptr %base
ret ptr %addr.1
@@ -492,48 +374,33 @@ define ptr @sdib(ptr %base, i64 %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
; RV64XTHEADMEMIDX-NEXT: th.sdib a1, (a0), 8, 0
; RV64XTHEADMEMIDX-NEXT: ret
- %addr.1 = getelementptr i64, ptr %base, i64 1
+ %addr.1 = getelementptr i64, ptr %base, iXLen 1
%res = add i64 %a, %b
store i64 %res, ptr %addr.1
ret ptr %addr.1
}
-define i8 @lrb_anyext(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrb_anyext:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrb_anyext:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i8, ptr %a, i64 %b
+define i8 @lrb_anyext(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrb_anyext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrb a0, a0, a1, 0
+; CHECK-NEXT: ret
+ %1 = getelementptr i8, ptr %a, iXLen %b
%2 = load i8, ptr %1, align 1
ret i8 %2
}
-define i64 @lrb(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrb:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrb a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrb:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i8, ptr %a, i64 %b
+define i32 @lrb(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrb a0, a0, a1, 0
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i8, ptr %a, iXLen %b
%2 = load i8, ptr %1, align 1
- %3 = sext i8 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = sext i8 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
define i8 @lurb_anyext(ptr %a, i32 %b) {
@@ -552,15 +419,11 @@ define i8 @lurb_anyext(ptr %a, i32 %b) {
ret i8 %3
}
-define i64 @lurb(ptr %a, i32 %b) {
+define i32 @lurb(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurb:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrb a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurb:
@@ -571,37 +434,29 @@ define i64 @lurb(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i8, ptr %a, i64 %1
%3 = load i8, ptr %2, align 1
- %4 = sext i8 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
-}
-
-define i64 @lrbu(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrbu:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrbu a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrbu:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrbu a0, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i8, ptr %a, i64 %b
+ %4 = sext i8 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
+}
+
+define i32 @lrbu(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrbu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrbu a0, a0, a1, 0
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i8, ptr %a, iXLen %b
%2 = load i8, ptr %1, align 1
- %3 = zext i8 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = zext i8 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
-define i64 @lurbu(ptr %a, i32 %b) {
+define i32 @lurbu(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurbu:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrbu a1, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrbu a0, a0, a1, 0
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurbu:
@@ -612,47 +467,32 @@ define i64 @lurbu(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i8, ptr %a, i64 %1
%3 = load i8, ptr %2, align 1
- %4 = zext i8 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
+ %4 = zext i8 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
}
-define i16 @lrh_anyext(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrh_anyext:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrh_anyext:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i16, ptr %a, i64 %b
+define i16 @lrh_anyext(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrh_anyext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrh a0, a0, a1, 1
+; CHECK-NEXT: ret
+ %1 = getelementptr i16, ptr %a, iXLen %b
%2 = load i16, ptr %1, align 2
ret i16 %2
}
-define i64 @lrh(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrh:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrh a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrh:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i16, ptr %a, i64 %b
+define i32 @lrh(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrh a0, a0, a1, 1
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i16, ptr %a, iXLen %b
%2 = load i16, ptr %1, align 2
- %3 = sext i16 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = sext i16 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
define i16 @lurh_anyext(ptr %a, i32 %b) {
@@ -671,15 +511,11 @@ define i16 @lurh_anyext(ptr %a, i32 %b) {
ret i16 %3
}
-define i64 @lurh(ptr %a, i32 %b) {
+define i32 @lurh(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurh:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrh a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurh:
@@ -690,37 +526,29 @@ define i64 @lurh(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i16, ptr %a, i64 %1
%3 = load i16, ptr %2, align 2
- %4 = sext i16 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
-}
-
-define i64 @lrhu(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrhu:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrhu a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrhu:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrhu a0, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i16, ptr %a, i64 %b
+ %4 = sext i16 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
+}
+
+define i32 @lrhu(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrhu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrhu a0, a0, a1, 1
+; CHECK-NEXT: add a0, a0, a0
+; CHECK-NEXT: ret
+ %1 = getelementptr i16, ptr %a, iXLen %b
%2 = load i16, ptr %1, align 2
- %3 = zext i16 %2 to i64
- %4 = add i64 %3, %3
- ret i64 %4
+ %3 = zext i16 %2 to i32
+ %4 = add i32 %3, %3
+ ret i32 %4
}
-define i64 @lurhu(ptr %a, i32 %b) {
+define i32 @lurhu(ptr %a, i32 %b) {
; RV32XTHEADMEMIDX-LABEL: lurhu:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrhu a1, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
-; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
+; RV32XTHEADMEMIDX-NEXT: th.lrhu a0, a0, a1, 1
+; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lurhu:
@@ -731,27 +559,22 @@ define i64 @lurhu(ptr %a, i32 %b) {
%1 = zext i32 %b to i64
%2 = getelementptr i16, ptr %a, i64 %1
%3 = load i16, ptr %2, align 2
- %4 = zext i16 %3 to i64
- %5 = add i64 %4, %4
- ret i64 %5
+ %4 = zext i16 %3 to i32
+ %5 = add i32 %4, %4
+ ret i32 %5
}
-define i32 @lrw_anyext(ptr %a, i64 %b) {
-; RV32XTHEADMEMIDX-LABEL: lrw_anyext:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: lrw_anyext:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2
-; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i32, ptr %a, i64 %b
+define i32 @lrw_anyext(ptr %a, iXLen %b) {
+; CHECK-LABEL: lrw_anyext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: th.lrw a0, a0, a1, 2
+; CHECK-NEXT: ret
+ %1 = getelementptr i32, ptr %a, iXLen %b
%2 = load i32, ptr %1, align 4
ret i32 %2
}
-define i64 @lrw(ptr %a, i64 %b) {
+define i64 @lrw(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrw:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 2
@@ -767,7 +590,7 @@ define i64 @lrw(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i32, ptr %a, i64 %b
+ %1 = getelementptr i32, ptr %a, iXLen %b
%2 = load i32, ptr %1, align 4
%3 = sext i32 %2 to i64
%4 = add i64 %3, %3
@@ -814,7 +637,7 @@ define i64 @lurw(ptr %a, i32 %b) {
ret i64 %5
}
-define i64 @lrwu(ptr %a, i64 %b) {
+define i64 @lrwu(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrwu:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 2
@@ -827,7 +650,7 @@ define i64 @lrwu(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrwu a0, a0, a1, 2
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i32, ptr %a, i64 %b
+ %1 = getelementptr i32, ptr %a, iXLen %b
%2 = load i32, ptr %1, align 4
%3 = zext i32 %2 to i64
%4 = add i64 %3, %3
@@ -855,7 +678,7 @@ define i64 @lurwu(ptr %a, i32 %b) {
ret i64 %5
}
-define i64 @lrd(ptr %a, i64 %b) {
+define i64 @lrd(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3
@@ -872,23 +695,23 @@ define i64 @lrd(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrd a0, a0, a1, 3
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = getelementptr i64, ptr %a, i64 %b
+ %1 = getelementptr i64, ptr %a, iXLen %b
%2 = load i64, ptr %1, align 8
%3 = add i64 %2, %2
ret i64 %3
}
-define i64 @lrd_2(ptr %a, i64 %b) {
+define i64 @lrd_2(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd_2:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: addi a2, a0, 96
-; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a2, a1, 3
-; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 100
-; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: add a0, a2, a2
-; RV32XTHEADMEMIDX-NEXT: sltu a2, a0, a2
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a1
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
+; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3
+; RV32XTHEADMEMIDX-NEXT: add a0, a1, a0
+; RV32XTHEADMEMIDX-NEXT: lw a1, 96(a0)
+; RV32XTHEADMEMIDX-NEXT: lw a2, 100(a0)
+; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1
+; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1
+; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2
+; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: lrd_2:
@@ -897,8 +720,8 @@ define i64 @lrd_2(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: th.lrd a0, a0, a1, 3
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = add i64 %b, 12
- %2 = getelementptr i64, ptr %a, i64 %1
+ %1 = add iXLen %b, 12
+ %2 = getelementptr i64, ptr %a, iXLen %1
%3 = load i64, ptr %2, align 8
%4 = add i64 %3, %3
ret i64 %4
@@ -928,20 +751,14 @@ define i64 @lurd(ptr %a, i32 %b) {
ret i64 %4
}
-define void @srb(ptr %a, i64 %b, i8 %c) {
-; RV32XTHEADMEMIDX-LABEL: srb:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
-; RV32XTHEADMEMIDX-NEXT: th.srb a3, a0, a1, 0
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: srb:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV64XTHEADMEMIDX-NEXT: th.srb a2, a0, a1, 0
-; RV64XTHEADMEMIDX-NEXT: ret
+define void @srb(ptr %a, iXLen %b, i8 %c) {
+; CHECK-LABEL: srb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a2, a2, a2
+; CHECK-NEXT: th.srb a2, a0, a1, 0
+; CHECK-NEXT: ret
%1 = add i8 %c, %c
- %2 = getelementptr i8, ptr %a, i64 %b
+ %2 = getelementptr i8, ptr %a, iXLen %b
store i8 %1, ptr %2, align 1
ret void
}
@@ -965,20 +782,14 @@ define void @surb(ptr %a, i32 %b, i8 %c) {
ret void
}
-define void @srh(ptr %a, i64 %b, i16 %c) {
-; RV32XTHEADMEMIDX-LABEL: srh:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
-; RV32XTHEADMEMIDX-NEXT: th.srh a3, a0, a1, 1
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: srh:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV64XTHEADMEMIDX-NEXT: th.srh a2, a0, a1, 1
-; RV64XTHEADMEMIDX-NEXT: ret
+define void @srh(ptr %a, iXLen %b, i16 %c) {
+; CHECK-LABEL: srh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a2, a2, a2
+; CHECK-NEXT: th.srh a2, a0, a1, 1
+; CHECK-NEXT: ret
%1 = add i16 %c, %c
- %2 = getelementptr i16, ptr %a, i64 %b
+ %2 = getelementptr i16, ptr %a, iXLen %b
store i16 %1, ptr %2, align 2
ret void
}
@@ -1002,20 +813,14 @@ define void @surh(ptr %a, i32 %b, i16 %c) {
ret void
}
-define void @srw(ptr %a, i64 %b, i32 %c) {
-; RV32XTHEADMEMIDX-LABEL: srw:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
-; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: srw:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2
-; RV64XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 2
-; RV64XTHEADMEMIDX-NEXT: ret
+define void @srw(ptr %a, iXLen %b, i32 %c) {
+; CHECK-LABEL: srw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a2, a2, a2
+; CHECK-NEXT: th.srw a2, a0, a1, 2
+; CHECK-NEXT: ret
%1 = add i32 %c, %c
- %2 = getelementptr i32, ptr %a, i64 %b
+ %2 = getelementptr i32, ptr %a, iXLen %b
store i32 %1, ptr %2, align 4
ret void
}
@@ -1039,16 +844,16 @@ define void @surw(ptr %a, i32 %b, i32 %c) {
ret void
}
-define void @srd(ptr %a, i64 %b, i64 %c) {
+define void @srd(ptr %a, iXLen %b, i64 %c) {
; RV32XTHEADMEMIDX-LABEL: srd:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a2, a3, a3
-; RV32XTHEADMEMIDX-NEXT: add a4, a4, a4
-; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3
-; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3
+; RV32XTHEADMEMIDX-NEXT: add a4, a2, a2
+; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3
+; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2
+; RV32XTHEADMEMIDX-NEXT: th.srw a4, a0, a1, 3
+; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2
; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4
-; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 3
+; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3
; RV32XTHEADMEMIDX-NEXT: ret
;
; RV64XTHEADMEMIDX-LABEL: srd:
@@ -1057,7 +862,7 @@ define void @srd(ptr %a, i64 %b, i64 %c) {
; RV64XTHEADMEMIDX-NEXT: th.srd a2, a0, a1, 3
; RV64XTHEADMEMIDX-NEXT: ret
%1 = add i64 %c, %c
- %2 = getelementptr i64, ptr %a, i64 %b
+ %2 = getelementptr i64, ptr %a, iXLen %b
store i64 %1, ptr %2, align 8
ret void
}
@@ -1087,24 +892,18 @@ define void @surd(ptr %a, i32 %b, i64 %c) {
}
define ptr @test_simm5(ptr %base, i32 %a, i32 %b) {
-; RV32XTHEADMEMIDX-LABEL: test_simm5:
-; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV32XTHEADMEMIDX-NEXT: th.swia a1, (a0), -12, 2
-; RV32XTHEADMEMIDX-NEXT: ret
-;
-; RV64XTHEADMEMIDX-LABEL: test_simm5:
-; RV64XTHEADMEMIDX: # %bb.0:
-; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2
-; RV64XTHEADMEMIDX-NEXT: th.swia a1, (a0), -12, 2
-; RV64XTHEADMEMIDX-NEXT: ret
+; CHECK-LABEL: test_simm5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: th.swia a1, (a0), -12, 2
+; CHECK-NEXT: ret
%addr.1 = getelementptr i32, ptr %base, i32 -12
%res = add i32 %a, %b
store i32 %res, ptr %base
ret ptr %addr.1
}
-define i64 @lrd_large_shift(ptr %a, i64 %b) {
+define i64 @lrd_large_shift(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd_large_shift:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 5
@@ -1119,14 +918,14 @@ define i64 @lrd_large_shift(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a0, a1, a0
; RV64XTHEADMEMIDX-NEXT: ld a0, 384(a0)
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = add i64 %b, 12
- %2 = shl i64 %1, 2
- %3 = getelementptr i64, ptr %a, i64 %2
+ %1 = add iXLen %b, 12
+ %2 = shl iXLen %1, 2
+ %3 = getelementptr i64, ptr %a, iXLen %2
%4 = load i64, ptr %3, align 8
ret i64 %4
}
-define i64 @lrd_large_offset(ptr %a, i64 %b) {
+define i64 @lrd_large_offset(ptr %a, iXLen %b) {
; RV32XTHEADMEMIDX-LABEL: lrd_large_offset:
; RV32XTHEADMEMIDX: # %bb.0:
; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3
@@ -1145,8 +944,8 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) {
; RV64XTHEADMEMIDX-NEXT: add a0, a0, a1
; RV64XTHEADMEMIDX-NEXT: ld a0, 1792(a0)
; RV64XTHEADMEMIDX-NEXT: ret
- %1 = add i64 %b, 12000
- %2 = getelementptr i64, ptr %a, i64 %1
+ %1 = add iXLen %b, 12000
+ %2 = getelementptr i64, ptr %a, iXLen %1
%3 = load i64, ptr %2, align 8
ret i64 %3
}
diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
index f9db686..1ef37f7 100644
--- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
+++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
@@ -242,7 +242,7 @@ define void @foo7(ptr nocapture %p) nounwind {
; RV64ZDINX: # %bb.0: # %entry
; RV64ZDINX-NEXT: lui a1, %hi(d)
; RV64ZDINX-NEXT: addi a2, a1, %lo(d)
-; RV64ZDINX-NEXT: lwu a2, 8(a2)
+; RV64ZDINX-NEXT: lw a2, 8(a2)
; RV64ZDINX-NEXT: lwu a1, %lo(d+4)(a1)
; RV64ZDINX-NEXT: slli a2, a2, 32
; RV64ZDINX-NEXT: or a1, a2, a1
@@ -337,7 +337,7 @@ define void @foo9(ptr nocapture %p) nounwind {
; RV64ZDINX: # %bb.0: # %entry
; RV64ZDINX-NEXT: lui a1, %hi(e)
; RV64ZDINX-NEXT: addi a2, a1, %lo(e)
-; RV64ZDINX-NEXT: lwu a2, 4(a2)
+; RV64ZDINX-NEXT: lw a2, 4(a2)
; RV64ZDINX-NEXT: lwu a1, %lo(e)(a1)
; RV64ZDINX-NEXT: slli a2, a2, 32
; RV64ZDINX-NEXT: or a1, a2, a1
@@ -480,7 +480,7 @@ define double @foo13(ptr nocapture %p) nounwind {
; RV64ZDINX-LABEL: foo13:
; RV64ZDINX: # %bb.0: # %entry
; RV64ZDINX-NEXT: lui a0, %hi(f)
-; RV64ZDINX-NEXT: lwu a1, %lo(f+8)(a0)
+; RV64ZDINX-NEXT: lw a1, %lo(f+8)(a0)
; RV64ZDINX-NEXT: lwu a0, %lo(f+4)(a0)
; RV64ZDINX-NEXT: slli a1, a1, 32
; RV64ZDINX-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/SPARC/tls-sp.ll b/llvm/test/CodeGen/SPARC/tls-sp.ll
new file mode 100644
index 0000000..de9af01
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/tls-sp.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=sparc -relocation-model=pic < %s | FileCheck --check-prefix=SPARC %s
+; RUN: llc -mtriple=sparc64 -relocation-model=pic < %s | FileCheck --check-prefix=SPARC64 %s
+
+@x = external thread_local global i8
+
+;; Test that we don't over-allocate stack space for the call to __tls_get_addr
+;; when the call frame pseudos can be eliminated.
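+;; Since the call frame folds away, the only stack adjustment we expect is the
+;; fixed minimum frame reserved by `save` (-96 on sparc32, -128 on sparc64 in
+;; the checks below), with no extra outgoing-argument area.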
+define ptr @no_alloca() nounwind {
+; SPARC-LABEL: no_alloca:
+; SPARC: ! %bb.0: ! %entry
+; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: .Ltmp0:
+; SPARC-NEXT: call .Ltmp1
+; SPARC-NEXT: .Ltmp2:
+; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC-NEXT: .Ltmp1:
+; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC-NEXT: add %i0, %o7, %i0
+; SPARC-NEXT: sethi %tgd_hi22(x), %i1
+; SPARC-NEXT: add %i1, %tgd_lo10(x), %i1
+; SPARC-NEXT: add %i0, %i1, %o0, %tgd_add(x)
+; SPARC-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT: nop
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: no_alloca:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: .Ltmp0:
+; SPARC64-NEXT: rd %pc, %o7
+; SPARC64-NEXT: .Ltmp2:
+; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC64-NEXT: .Ltmp1:
+; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC64-NEXT: add %i0, %o7, %i0
+; SPARC64-NEXT: sethi %tgd_hi22(x), %i1
+; SPARC64-NEXT: add %i1, %tgd_lo10(x), %i1
+; SPARC64-NEXT: add %i0, %i1, %o0, %tgd_add(x)
+; SPARC64-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore %g0, %o0, %o0
+entry:
+ %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+ ret ptr %0
+}
+
+;; Test that %sp is valid for the call to __tls_get_addr. We store to a dynamic
+;; alloca so that the call's frame pseudos cannot be eliminated.
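+;; Note: the sparc64 store offset below presumably folds in the 2047-byte
+;; stack bias plus the reserved register-save area (2047 + 128 = 2175).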
+define ptr @dynamic_alloca(i64 %n) nounwind {
+; SPARC-LABEL: dynamic_alloca:
+; SPARC: ! %bb.0: ! %entry
+; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: .Ltmp3:
+; SPARC-NEXT: call .Ltmp4
+; SPARC-NEXT: .Ltmp5:
+; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i0
+; SPARC-NEXT: .Ltmp4:
+; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i0
+; SPARC-NEXT: add %i0, %o7, %i0
+; SPARC-NEXT: sethi %tgd_hi22(x), %i2
+; SPARC-NEXT: add %i2, %tgd_lo10(x), %i2
+; SPARC-NEXT: add %i0, %i2, %o0, %tgd_add(x)
+; SPARC-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT: nop
+; SPARC-NEXT: add %i1, 7, %i0
+; SPARC-NEXT: and %i0, -8, %i0
+; SPARC-NEXT: sub %sp, %i0, %i0
+; SPARC-NEXT: add %i0, -8, %sp
+; SPARC-NEXT: mov 1, %i1
+; SPARC-NEXT: stb %i1, [%i0+88]
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: dynamic_alloca:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: .Ltmp3:
+; SPARC64-NEXT: rd %pc, %o7
+; SPARC64-NEXT: .Ltmp5:
+; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i1
+; SPARC64-NEXT: .Ltmp4:
+; SPARC64-NEXT: or %i1, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i1
+; SPARC64-NEXT: add %i1, %o7, %i1
+; SPARC64-NEXT: sethi %tgd_hi22(x), %i2
+; SPARC64-NEXT: add %i2, %tgd_lo10(x), %i2
+; SPARC64-NEXT: add %i1, %i2, %o0, %tgd_add(x)
+; SPARC64-NEXT: call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: add %i0, 15, %i0
+; SPARC64-NEXT: and %i0, -16, %i0
+; SPARC64-NEXT: sub %sp, %i0, %i0
+; SPARC64-NEXT: mov %i0, %sp
+; SPARC64-NEXT: mov 1, %i1
+; SPARC64-NEXT: stb %i1, [%i0+2175]
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore %g0, %o0, %o0
+entry:
+ %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+ %1 = alloca i8, i64 %n
+ store i8 1, ptr %1
+ ret ptr %0
+}
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll
index 3d46b52..70030ca 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std"
@@ -337,3 +338,68 @@ entry:
}
declare float @llvm.fma.f32(float, float, float)
+
+; CHECK: OpFunction
+; CHECK: %[[#d:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#fracPtr:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#integralPtr:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#varPtr:]] = OpVariable %[[#]] Function
+; CHECK: %[[#frac:]] = OpExtInst %[[#var2]] %[[#extinst_id]] modf %[[#d]] %[[#varPtr]]
+; CHECK: %[[#integral:]] = OpLoad %[[#var2]] %[[#varPtr]]
+; CHECK: OpStore %[[#fracPtr]] %[[#frac]]
+; CHECK: OpStore %[[#integralPtr]] %[[#integral]]
+; CHECK: OpFunctionEnd
+define void @TestModf(double %d, ptr addrspace(1) %frac, ptr addrspace(1) %integral) {
+entry:
+  %0 = tail call { double, double } @llvm.modf.f64(double %d)
+  %1 = extractvalue { double, double } %0, 0
+  %2 = extractvalue { double, double } %0, 1
+  store double %1, ptr addrspace(1) %frac, align 8
+  store double %2, ptr addrspace(1) %integral, align 8
+ ret void
+}
+
+; CHECK: OpFunction
+; CHECK: %[[#d:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#fracPtr:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#integralPtr:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#entryBlock:]] = OpLabel
+; CHECK: %[[#varPtr:]] = OpVariable %[[#]] Function
+; CHECK: OpBranchConditional %[[#]] %[[#lor_lhs_falseBlock:]] %[[#if_thenBlock:]]
+; CHECK: %[[#lor_lhs_falseBlock]] = OpLabel
+; CHECK: OpBranchConditional %[[#]] %[[#if_endBlock:]] %[[#if_thenBlock]]
+; CHECK: %[[#if_thenBlock]] = OpLabel
+; CHECK: OpBranch %[[#returnBlock:]]
+; CHECK: %[[#if_endBlock]] = OpLabel
+; CHECK: %[[#frac:]] = OpExtInst %[[#var2]] %[[#extinst_id]] modf %[[#d]] %[[#varPtr]]
+; CHECK: %[[#integral:]] = OpLoad %[[#var2]] %[[#varPtr]]
+; CHECK: OpStore %[[#fracPtr]] %[[#frac]]
+; CHECK: OpStore %[[#integralPtr]] %[[#integral]]
+; CHECK: OpFunctionEnd
+define dso_local void @TestModf2(double noundef %d, ptr noundef %frac, ptr noundef %integral) {
+entry:
+ %0 = load ptr, ptr %frac, align 8
+ %tobool = icmp ne ptr %0, null
+ br i1 %tobool, label %lor.lhs.false, label %if.then
+
+lor.lhs.false:
+ %1 = load ptr, ptr %integral, align 8
+ %tobool1 = icmp ne ptr %1, null
+ br i1 %tobool1, label %if.end, label %if.then
+
+if.then:
+ br label %return
+
+if.end:
+  %2 = tail call { double, double } @llvm.modf.f64(double %d)
+  %3 = extractvalue { double, double } %2, 0
+  %4 = extractvalue { double, double } %2, 1
+  store double %3, ptr %frac, align 4
+  store double %4, ptr %integral, align 4
+ br label %return
+
+return:
+ ret void
+}
+
+declare { double, double } @llvm.modf.f64(double)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-vector-load-store.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-vector-load-store.ll
new file mode 100644
index 0000000..edd2cc4
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-vector-load-store.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
+; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
+
+@.str = private unnamed_addr constant [7 x i8] c"buffer\00", align 1
+
+define void @main() "hlsl.shader"="pixel" {
+; CHECK: %25 = OpFunction %2 None %3 ; -- Begin function main
+; CHECK-NEXT: %1 = OpLabel
+; CHECK-NEXT: %26 = OpVariable %14 Function %23
+; CHECK-NEXT: %27 = OpLoad %7 %24
+; CHECK-NEXT: %28 = OpImageRead %5 %27 %16
+; CHECK-NEXT: %29 = OpCompositeExtract %4 %28 0
+; CHECK-NEXT: %30 = OpCompositeExtract %4 %28 1
+; CHECK-NEXT: %31 = OpFAdd %4 %30 %29
+; CHECK-NEXT: %32 = OpCompositeInsert %5 %31 %28 0
+; CHECK-NEXT: %33 = OpLoad %7 %24
+; CHECK-NEXT: OpImageWrite %33 %16 %32
+; CHECK-NEXT: OpReturn
+; CHECK-NEXT: OpFunctionEnd
+entry:
+ %0 = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str)
+ %1 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 0) %0, i32 0)
+ %2 = load <4 x float>, ptr addrspace(11) %1, align 16
+ %3 = extractelement <4 x float> %2, i64 0
+ %4 = extractelement <4 x float> %2, i64 1
+ %add.i = fadd reassoc nnan ninf nsz arcp afn float %4, %3
+ %5 = insertelement <4 x float> %2, float %add.i, i64 0
+ store <4 x float> %5, ptr addrspace(11) %1, align 16
+ ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare target("spirv.Image", float, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32, i32, i32, i32, i1, ptr) #0
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 0), i32) #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index bbf4d50..8a6a303 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -16,31 +16,31 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
; CHECK-NEXT: aghi %r15, -168
; CHECK-NEXT: lhrl %r1, f+4
+; CHECK-NEXT: sll %r1, 8
; CHECK-NEXT: larl %r2, f
-; CHECK-NEXT: llc %r2, 6(%r2)
-; CHECK-NEXT: larl %r3, e
-; CHECK-NEXT: lb %r0, 3(%r3)
-; CHECK-NEXT: rosbg %r2, %r1, 32, 55, 8
-; CHECK-NEXT: vlvgp %v0, %r2, %r0
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: vlvgf %v0, %r2, 2
-; CHECK-NEXT: vlvgp %v1, %r0, %r2
-; CHECK-NEXT: vlvgp %v2, %r2, %r2
-; CHECK-NEXT: lr %r1, %r2
+; CHECK-NEXT: ic %r1, 6(%r2)
+; CHECK-NEXT: larl %r2, e
+; CHECK-NEXT: lb %r0, 3(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vlvgp %v1, %r1, %r0
+; CHECK-NEXT: vlvgf %v1, %r1, 0
+; CHECK-NEXT: vlvgf %v1, %r1, 2
+; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d
; CHECK-NEXT: nilh %r1, 255
; CHECK-NEXT: chi %r1, 128
; CHECK-NEXT: ipm %r1
; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: vlvgf %v0, %r0, 2
; CHECK-NEXT: vgbm %v3, 30583
; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vlvgf %v1, %r0, 0
-; CHECK-NEXT: vlvgf %v1, %r0, 2
; CHECK-NEXT: vn %v1, %v1, %v3
; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
-; CHECK-NEXT: vchlf %v0, %v0, %v3
-; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: vchlf %v1, %v1, %v3
+; CHECK-NEXT: vlgvf %r13, %v1, 0
; CHECK-NEXT: vchlf %v2, %v2, %v3
; CHECK-NEXT: vlgvf %r3, %v2, 1
; CHECK-NEXT: nilf %r3, 1
@@ -54,13 +54,13 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: nilf %r14, 1
; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12
; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT: vlgvf %r13, %v0, 1
+; CHECK-NEXT: vlgvf %r13, %v1, 1
; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT: vlgvf %r13, %v0, 2
+; CHECK-NEXT: vlgvf %r13, %v1, 2
; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT: vlgvf %r13, %v0, 3
+; CHECK-NEXT: vlgvf %r13, %v1, 3
; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT: vchlf %v0, %v1, %v3
+; CHECK-NEXT: vchlf %v0, %v0, %v3
; CHECK-NEXT: vlgvf %r13, %v0, 0
; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7
; CHECK-NEXT: vlgvf %r13, %v0, 1
diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll
index 9acdd7e..b70505c 100644
--- a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll
+++ b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll
@@ -17,6 +17,7 @@ declare void @_ZNSsC1EPKcRKSaIcE() unnamed_addr #0
; CHECK: .LBB0_2
; Function Attrs: nounwind
define hidden void @_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttributesEPKNS_5SUnitENS_13SUnitIteratorEPKNS_11ScheduleDAGE() #0 align 2 {
+ %a = alloca i8
br i1 undef, label %1, label %2
; <label>:1: ; preds = %0
@@ -25,7 +26,7 @@ define hidden void @_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttri
br label %3
; <label>:2: ; preds = %0
- call void @llvm.lifetime.start.p0(i64 1, ptr undef) #0
+ call void @llvm.lifetime.start.p0(i64 1, ptr %a) #0
call void @_ZNSaIcEC2Ev() #0
br label %3
diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
new file mode 100644
index 0000000..8030438
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare i32 @memcmp(ptr, ptr, i32)
+
+define i1 @memcmp_expand_3(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_3:
+; CHECK: .functype memcmp_expand_3 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.load16_u $push7=, 0($0):p2align=0
+; CHECK-NEXT: i32.load16_u $push6=, 0($1):p2align=0
+; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6
+; CHECK-NEXT: i32.const $push0=, 2
+; CHECK-NEXT: i32.add $push3=, $0, $pop0
+; CHECK-NEXT: i32.load8_u $push4=, 0($pop3)
+; CHECK-NEXT: i32.const $push13=, 2
+; CHECK-NEXT: i32.add $push1=, $1, $pop13
+; CHECK-NEXT: i32.load8_u $push2=, 0($pop1)
+; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2
+; CHECK-NEXT: i32.or $push9=, $pop8, $pop5
+; CHECK-NEXT: i32.const $push10=, 65535
+; CHECK-NEXT: i32.and $push11=, $pop9, $pop10
+; CHECK-NEXT: i32.eqz $push12=, $pop11
+; CHECK-NEXT: return $pop12
+ %cmp_3 = call i32 @memcmp(ptr %a, ptr %b, i32 3)
+ %res = icmp eq i32 %cmp_3, 0
+ ret i1 %res
+}
+
+define i1 @memcmp_expand_5(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_5:
+; CHECK: .functype memcmp_expand_5 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0
+; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0
+; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6
+; CHECK-NEXT: i32.const $push0=, 4
+; CHECK-NEXT: i32.add $push3=, $0, $pop0
+; CHECK-NEXT: i32.load8_u $push4=, 0($pop3)
+; CHECK-NEXT: i32.const $push11=, 4
+; CHECK-NEXT: i32.add $push1=, $1, $pop11
+; CHECK-NEXT: i32.load8_u $push2=, 0($pop1)
+; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2
+; CHECK-NEXT: i32.or $push9=, $pop8, $pop5
+; CHECK-NEXT: i32.eqz $push10=, $pop9
+; CHECK-NEXT: return $pop10
+ %cmp_5 = call i32 @memcmp(ptr %a, ptr %b, i32 5)
+ %res = icmp eq i32 %cmp_5, 0
+ ret i1 %res
+}
+
+define i1 @memcmp_expand_7(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_7:
+; CHECK: .functype memcmp_expand_7 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0
+; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0
+; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6
+; CHECK-NEXT: i32.const $push0=, 3
+; CHECK-NEXT: i32.add $push3=, $0, $pop0
+; CHECK-NEXT: i32.load $push4=, 0($pop3):p2align=0
+; CHECK-NEXT: i32.const $push11=, 3
+; CHECK-NEXT: i32.add $push1=, $1, $pop11
+; CHECK-NEXT: i32.load $push2=, 0($pop1):p2align=0
+; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2
+; CHECK-NEXT: i32.or $push9=, $pop8, $pop5
+; CHECK-NEXT: i32.eqz $push10=, $pop9
+; CHECK-NEXT: return $pop10
+ %cmp_7 = call i32 @memcmp(ptr %a, ptr %b, i32 7)
+ %res = icmp eq i32 %cmp_7, 0
+ ret i1 %res
+}
+
+; INFO: Negative test
+; Should not be expanded, even with simd128.
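+; (Presumably 129 bytes exceeds the backend's maximum inline-expansion size,
+; so the memcmp libcall is kept.)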
+define i1 @memcmp_expand_129(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_129:
+; CHECK: .functype memcmp_expand_129 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 129
+; CHECK-NEXT: call $push1=, memcmp, $0, $1, $pop0
+; CHECK-NEXT: i32.eqz $push2=, $pop1
+; CHECK-NEXT: return $pop2
+ %cmp_129 = call i32 @memcmp(ptr %a, ptr %b, i32 129)
+ %res = icmp eq i32 %cmp_129, 0
+ ret i1 %res
+}
+
+define i1 @memcmp_expand_2(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_2:
+; CHECK: .functype memcmp_expand_2 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.load16_u $push1=, 0($0):p2align=0
+; CHECK-NEXT: i32.load16_u $push0=, 0($1):p2align=0
+; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2)
+ %res = icmp eq i32 %cmp_2, 0
+ ret i1 %res
+}
+
+define i1 @memcmp_expand_2_align(ptr align(2) %a, ptr align(2) %b) {
+; CHECK-LABEL: memcmp_expand_2_align:
+; CHECK: .functype memcmp_expand_2_align (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.load16_u $push1=, 0($0)
+; CHECK-NEXT: i32.load16_u $push0=, 0($1)
+; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2)
+ %res = icmp eq i32 %cmp_2, 0
+ ret i1 %res
+}
+
+define i1 @memcmp_expand_8(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_8:
+; CHECK: .functype memcmp_expand_8 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $push1=, 0($0):p2align=0
+; CHECK-NEXT: i64.load $push0=, 0($1):p2align=0
+; CHECK-NEXT: i64.eq $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %cmp_8 = call i32 @memcmp(ptr %a, ptr %b, i32 8)
+ %res = icmp eq i32 %cmp_8, 0
+ ret i1 %res
+}
+
+; TODO: With simd128 enabled, this should use a single pair of 128-bit loads
+; (i64x2 or an equivalent shape) rather than two pairs of i64 loads.
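+; A possible lowering, assuming simd128: two v128.load's, v128.xor,
+; v128.any_true, i32.eqz.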
+define i1 @memcmp_expand_16(ptr %a, ptr %b) {
+; CHECK-LABEL: memcmp_expand_16:
+; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
+; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
+; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
+; CHECK-NEXT: i32.const $push0=, 8
+; CHECK-NEXT: i32.add $push3=, $0, $pop0
+; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
+; CHECK-NEXT: i32.const $push11=, 8
+; CHECK-NEXT: i32.add $push1=, $1, $pop11
+; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
+; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
+; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
+; CHECK-NEXT: i64.eqz $push10=, $pop9
+; CHECK-NEXT: return $pop10
+ %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
+ %res = icmp eq i32 %cmp_16, 0
+ ret i1 %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
new file mode 100644
index 0000000..97c2311
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -0,0 +1,1413 @@
+; RUN: opt -S -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
+
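+; The checks below pin down the current scalar, per-field lowering of these
+; struct loops; presumably interleaved vectorization is not (yet) applied to
+; them on wasm.
+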
+%struct.TwoInts = type { i32, i32 }
+%struct.ThreeInts = type { i32, i32, i32 }
+%struct.FourInts = type { i32, i32, i32, i32 }
+%struct.ThreeShorts = type { i16, i16, i16 }
+%struct.FourShorts = type { i16, i16, i16, i16 }
+%struct.FiveShorts = type { i16, i16, i16, i16, i16 }
+%struct.TwoBytes = type { i8, i8 }
+%struct.ThreeBytes = type { i8, i8, i8 }
+%struct.FourBytes = type { i8, i8, i8, i8 }
+%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+
+; CHECK-LABEL: two_ints_same_op:
+; CHECK: loop
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoInts, ptr %1, i32 %8
+ %10 = load i32, ptr %9, align 4
+ %11 = getelementptr inbounds %struct.TwoInts, ptr %2, i32 %8
+ %12 = load i32, ptr %11, align 4
+ %13 = add i32 %12, %10
+ %14 = getelementptr inbounds %struct.TwoInts, ptr %0, i32 %8
+ store i32 %13, ptr %14, align 4
+ %15 = getelementptr inbounds i8, ptr %9, i32 4
+ %16 = load i32, ptr %15, align 4
+ %17 = getelementptr inbounds i8, ptr %11, i32 4
+ %18 = load i32, ptr %17, align 4
+ %19 = add i32 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 4
+ store i32 %19, ptr %20, align 4
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: two_ints_vary_op:
+; CHECK: loop
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.sub
+; CHECK: i32.store
+define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoInts, ptr %1, i32 %8
+ %10 = load i32, ptr %9, align 4
+ %11 = getelementptr inbounds %struct.TwoInts, ptr %2, i32 %8
+ %12 = load i32, ptr %11, align 4
+ %13 = add i32 %12, %10
+ %14 = getelementptr inbounds %struct.TwoInts, ptr %0, i32 %8
+ store i32 %13, ptr %14, align 4
+ %15 = getelementptr inbounds i8, ptr %9, i32 4
+ %16 = load i32, ptr %15, align 4
+ %17 = getelementptr inbounds i8, ptr %11, i32 4
+ %18 = load i32, ptr %17, align 4
+ %19 = sub i32 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 4
+ store i32 %19, ptr %20, align 4
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: three_ints:
+; CHECK: loop
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeInts, ptr %1, i32 %8
+ %10 = load i32, ptr %9, align 4
+ %11 = getelementptr inbounds %struct.ThreeInts, ptr %2, i32 %8
+ %12 = load i32, ptr %11, align 4
+ %13 = add nsw i32 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeInts, ptr %0, i32 %8
+ store i32 %13, ptr %14, align 4
+ %15 = getelementptr inbounds i8, ptr %9, i32 4
+ %16 = load i32, ptr %15, align 4
+ %17 = getelementptr inbounds i8, ptr %11, i32 4
+ %18 = load i32, ptr %17, align 4
+ %19 = add nsw i32 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 4
+ store i32 %19, ptr %20, align 4
+ %21 = getelementptr inbounds i8, ptr %9, i32 8
+ %22 = load i32, ptr %21, align 4
+ %23 = getelementptr inbounds i8, ptr %11, i32 8
+ %24 = load i32, ptr %23, align 4
+ %25 = add nsw i32 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 8
+ store i32 %25, ptr %26, align 4
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: three_shorts:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.mul
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.mul
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.mul
+; CHECK: i32.store16
+define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.ThreeShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = mul i16 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = mul i16 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = mul i16 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: four_shorts_same_op:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = sub i16 %10, %12
+ %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = sub i16 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = sub i16 %22, %24
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = getelementptr inbounds i8, ptr %9, i32 6
+ %28 = load i16, ptr %27, align 2
+ %29 = getelementptr inbounds i8, ptr %11, i32 6
+ %30 = load i16, ptr %29, align 2
+ %31 = sub i16 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 6
+ store i16 %31, ptr %32, align 2
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_shorts_split_op:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = or i16 %12, %10
+ %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = or i16 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = xor i16 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = getelementptr inbounds i8, ptr %9, i32 6
+ %28 = load i16, ptr %27, align 2
+ %29 = getelementptr inbounds i8, ptr %11, i32 6
+ %30 = load i16, ptr %29, align 2
+ %31 = xor i16 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 6
+ store i16 %31, ptr %32, align 2
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_shorts_interleave_op:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.or
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.xor
+; CHECK: i32.store16
+define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 2
+ %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 2
+ %13 = or i16 %12, %10
+ %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 2
+ %15 = getelementptr inbounds i8, ptr %9, i32 2
+ %16 = load i16, ptr %15, align 2
+ %17 = getelementptr inbounds i8, ptr %11, i32 2
+ %18 = load i16, ptr %17, align 2
+ %19 = xor i16 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 2
+ store i16 %19, ptr %20, align 2
+ %21 = getelementptr inbounds i8, ptr %9, i32 4
+ %22 = load i16, ptr %21, align 2
+ %23 = getelementptr inbounds i8, ptr %11, i32 4
+ %24 = load i16, ptr %23, align 2
+ %25 = or i16 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 4
+ store i16 %25, ptr %26, align 2
+ %27 = getelementptr inbounds i8, ptr %9, i32 6
+ %28 = load i16, ptr %27, align 2
+ %29 = getelementptr inbounds i8, ptr %11, i32 6
+ %30 = load i16, ptr %29, align 2
+ %31 = xor i16 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 6
+ store i16 %31, ptr %32, align 2
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: five_shorts:
+; CHECK: loop
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+; CHECK: i32.load16_u
+; CHECK: i32.load16_u
+; CHECK: i32.sub
+; CHECK: i32.store16
+define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %39, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FiveShorts, ptr %1, i32 %8
+ %10 = load i16, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FiveShorts, ptr %2, i32 %8
+ %12 = load i16, ptr %11, align 1
+ %13 = sub i16 %10, %12
+ %14 = getelementptr inbounds %struct.FiveShorts, ptr %0, i32 %8
+ store i16 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i16, ptr %9, i32 1
+ %16 = load i16, ptr %15, align 1
+ %17 = getelementptr inbounds i16, ptr %11, i32 1
+ %18 = load i16, ptr %17, align 1
+ %19 = sub i16 %16, %18
+ %20 = getelementptr inbounds i16, ptr %14, i32 1
+ store i16 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i16, ptr %9, i32 2
+ %22 = load i16, ptr %21, align 1
+ %23 = getelementptr inbounds i16, ptr %11, i32 2
+ %24 = load i16, ptr %23, align 1
+ %25 = sub i16 %22, %24
+ %26 = getelementptr inbounds i16, ptr %14, i32 2
+ store i16 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i16, ptr %9, i32 3
+ %28 = load i16, ptr %27, align 1
+ %29 = getelementptr inbounds i16, ptr %11, i32 3
+ %30 = load i16, ptr %29, align 1
+ %31 = sub i16 %28, %30
+ %32 = getelementptr inbounds i16, ptr %14, i32 3
+ store i16 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i16, ptr %9, i32 4
+ %34 = load i16, ptr %33, align 1
+ %35 = getelementptr inbounds i16, ptr %11, i32 4
+ %36 = load i16, ptr %35, align 1
+ %37 = sub i16 %34, %36
+ %38 = getelementptr inbounds i16, ptr %14, i32 4
+ store i16 %37, ptr %38, align 1
+ %39 = add nuw i32 %8, 1
+ %40 = icmp eq i32 %39, %3
+ br i1 %40, label %6, label %7
+}
+
+; CHECK-LABEL: two_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.TwoBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.TwoBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = mul i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: two_bytes_vary_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %21, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.TwoBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.TwoBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.TwoBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = add nuw i32 %8, 1
+ %22 = icmp eq i32 %21, %3
+ br i1 %22, label %6, label %7
+}
+
+; CHECK-LABEL: three_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.ThreeBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = and i8 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = and i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = and i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: three_bytes_interleave_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %27, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.ThreeBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.ThreeBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.ThreeBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = add nuw i32 %8, 1
+ %28 = icmp eq i32 %27, %3
+ br i1 %28, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store8
+define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = and i8 %12, %10
+ %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = and i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = and i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = and i8 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_split_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = mul i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = sub i8 %22, %24
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = sub i8 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_interleave_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %33, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = sub i8 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = add nuw i32 %8, 1
+ %34 = icmp eq i32 %33, %3
+ br i1 %34, label %6, label %7
+}
+
+; CHECK-LABEL: eight_bytes_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store8
+define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %57, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = mul i8 %12, %10
+ %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = mul i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = mul i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = mul i8 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i8, ptr %9, i32 4
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %11, i32 4
+ %36 = load i8, ptr %35, align 1
+ %37 = mul i8 %36, %34
+ %38 = getelementptr inbounds i8, ptr %14, i32 4
+ store i8 %37, ptr %38, align 1
+ %39 = getelementptr inbounds i8, ptr %9, i32 5
+ %40 = load i8, ptr %39, align 1
+ %41 = getelementptr inbounds i8, ptr %11, i32 5
+ %42 = load i8, ptr %41, align 1
+ %43 = mul i8 %42, %40
+ %44 = getelementptr inbounds i8, ptr %14, i32 5
+ store i8 %43, ptr %44, align 1
+ %45 = getelementptr inbounds i8, ptr %9, i32 6
+ %46 = load i8, ptr %45, align 1
+ %47 = getelementptr inbounds i8, ptr %11, i32 6
+ %48 = load i8, ptr %47, align 1
+ %49 = mul i8 %48, %46
+ %50 = getelementptr inbounds i8, ptr %14, i32 6
+ store i8 %49, ptr %50, align 1
+ %51 = getelementptr inbounds i8, ptr %9, i32 7
+ %52 = load i8, ptr %51, align 1
+ %53 = getelementptr inbounds i8, ptr %11, i32 7
+ %54 = load i8, ptr %53, align 1
+ %55 = mul i8 %54, %52
+ %56 = getelementptr inbounds i8, ptr %14, i32 7
+ store i8 %55, ptr %56, align 1
+ %57 = add nuw i32 %8, 1
+ %58 = icmp eq i32 %57, %3
+ br i1 %58, label %6, label %7
+}
+
+; CHECK-LABEL: eight_bytes_split_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %57, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = add i8 %18, %16
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = add i8 %30, %28
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i8, ptr %9, i32 4
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %11, i32 4
+ %36 = load i8, ptr %35, align 1
+ %37 = sub i8 %34, %36
+ %38 = getelementptr inbounds i8, ptr %14, i32 4
+ store i8 %37, ptr %38, align 1
+ %39 = getelementptr inbounds i8, ptr %9, i32 5
+ %40 = load i8, ptr %39, align 1
+ %41 = getelementptr inbounds i8, ptr %11, i32 5
+ %42 = load i8, ptr %41, align 1
+ %43 = sub i8 %40, %42
+ %44 = getelementptr inbounds i8, ptr %14, i32 5
+ store i8 %43, ptr %44, align 1
+ %45 = getelementptr inbounds i8, ptr %9, i32 6
+ %46 = load i8, ptr %45, align 1
+ %47 = getelementptr inbounds i8, ptr %11, i32 6
+ %48 = load i8, ptr %47, align 1
+ %49 = sub i8 %46, %48
+ %50 = getelementptr inbounds i8, ptr %14, i32 6
+ store i8 %49, ptr %50, align 1
+ %51 = getelementptr inbounds i8, ptr %9, i32 7
+ %52 = load i8, ptr %51, align 1
+ %53 = getelementptr inbounds i8, ptr %11, i32 7
+ %54 = load i8, ptr %53, align 1
+ %55 = sub i8 %52, %54
+ %56 = getelementptr inbounds i8, ptr %14, i32 7
+ store i8 %55, ptr %56, align 1
+ %57 = add nuw i32 %8, 1
+ %58 = icmp eq i32 %57, %3
+ br i1 %58, label %6, label %7
+}
+
+; CHECK-LABEL: eight_bytes_interleave_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store8
+define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %57, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8
+ %12 = load i8, ptr %11, align 1
+ %13 = add i8 %12, %10
+ %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %9, i32 1
+ %16 = load i8, ptr %15, align 1
+ %17 = getelementptr inbounds i8, ptr %11, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = sub i8 %16, %18
+ %20 = getelementptr inbounds i8, ptr %14, i32 1
+ store i8 %19, ptr %20, align 1
+ %21 = getelementptr inbounds i8, ptr %9, i32 2
+ %22 = load i8, ptr %21, align 1
+ %23 = getelementptr inbounds i8, ptr %11, i32 2
+ %24 = load i8, ptr %23, align 1
+ %25 = add i8 %24, %22
+ %26 = getelementptr inbounds i8, ptr %14, i32 2
+ store i8 %25, ptr %26, align 1
+ %27 = getelementptr inbounds i8, ptr %9, i32 3
+ %28 = load i8, ptr %27, align 1
+ %29 = getelementptr inbounds i8, ptr %11, i32 3
+ %30 = load i8, ptr %29, align 1
+ %31 = sub i8 %28, %30
+ %32 = getelementptr inbounds i8, ptr %14, i32 3
+ store i8 %31, ptr %32, align 1
+ %33 = getelementptr inbounds i8, ptr %9, i32 4
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %11, i32 4
+ %36 = load i8, ptr %35, align 1
+ %37 = add i8 %36, %34
+ %38 = getelementptr inbounds i8, ptr %14, i32 4
+ store i8 %37, ptr %38, align 1
+ %39 = getelementptr inbounds i8, ptr %9, i32 5
+ %40 = load i8, ptr %39, align 1
+ %41 = getelementptr inbounds i8, ptr %11, i32 5
+ %42 = load i8, ptr %41, align 1
+ %43 = sub i8 %40, %42
+ %44 = getelementptr inbounds i8, ptr %14, i32 5
+ store i8 %43, ptr %44, align 1
+ %45 = getelementptr inbounds i8, ptr %9, i32 6
+ %46 = load i8, ptr %45, align 1
+ %47 = getelementptr inbounds i8, ptr %11, i32 6
+ %48 = load i8, ptr %47, align 1
+ %49 = add i8 %48, %46
+ %50 = getelementptr inbounds i8, ptr %14, i32 6
+ store i8 %49, ptr %50, align 1
+ %51 = getelementptr inbounds i8, ptr %9, i32 7
+ %52 = load i8, ptr %51, align 1
+ %53 = getelementptr inbounds i8, ptr %11, i32 7
+ %54 = load i8, ptr %53, align 1
+ %55 = sub i8 %52, %54
+ %56 = getelementptr inbounds i8, ptr %14, i32 7
+ store i8 %55, ptr %56, align 1
+ %57 = add nuw i32 %8, 1
+ %58 = icmp eq i32 %57, %3
+ br i1 %58, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_into_four_ints_same_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.load
+; CHECK: i32.add
+; CHECK: i32.store
+define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noundef %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %49, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = zext i8 %10 to i32
+ %12 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %13 = load i8, ptr %12, align 1
+ %14 = zext i8 %13 to i32
+ %15 = mul nuw nsw i32 %14, %11
+ %16 = getelementptr inbounds %struct.FourInts, ptr %0, i32 %8
+ %17 = load i32, ptr %16, align 4
+ %18 = add nsw i32 %15, %17
+ store i32 %18, ptr %16, align 4
+ %19 = getelementptr inbounds i8, ptr %9, i32 1
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i32
+ %22 = getelementptr inbounds i8, ptr %12, i32 1
+ %23 = load i8, ptr %22, align 1
+ %24 = zext i8 %23 to i32
+ %25 = mul nuw nsw i32 %24, %21
+ %26 = getelementptr inbounds i8, ptr %16, i32 4
+ %27 = load i32, ptr %26, align 4
+ %28 = add nsw i32 %25, %27
+ store i32 %28, ptr %26, align 4
+ %29 = getelementptr inbounds i8, ptr %9, i32 2
+ %30 = load i8, ptr %29, align 1
+ %31 = zext i8 %30 to i32
+ %32 = getelementptr inbounds i8, ptr %12, i32 2
+ %33 = load i8, ptr %32, align 1
+ %34 = zext i8 %33 to i32
+ %35 = mul nuw nsw i32 %34, %31
+ %36 = getelementptr inbounds i8, ptr %16, i32 8
+ %37 = load i32, ptr %36, align 4
+ %38 = add nsw i32 %35, %37
+ store i32 %38, ptr %36, align 4
+ %39 = getelementptr inbounds i8, ptr %9, i32 3
+ %40 = load i8, ptr %39, align 1
+ %41 = zext i8 %40 to i32
+ %42 = getelementptr inbounds i8, ptr %12, i32 3
+ %43 = load i8, ptr %42, align 1
+ %44 = zext i8 %43 to i32
+ %45 = mul nuw nsw i32 %44, %41
+ %46 = getelementptr inbounds i8, ptr %16, i32 12
+ %47 = load i32, ptr %46, align 4
+ %48 = add nsw i32 %45, %47
+ store i32 %48, ptr %46, align 4
+ %49 = add nuw i32 %8, 1
+ %50 = icmp eq i32 %49, %3
+ br i1 %50, label %6, label %7
+}
+
+; CHECK-LABEL: four_bytes_into_four_ints_vary_op:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.add
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.sub
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.mul
+; CHECK: i32.store
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.and
+; CHECK: i32.store
+define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %6, label %7
+
+6: ; preds = %7, %4
+ ret void
+
+7: ; preds = %4, %7
+ %8 = phi i32 [ %40, %7 ], [ 0, %4 ]
+ %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8
+ %10 = load i8, ptr %9, align 1
+ %11 = zext i8 %10 to i32
+ %12 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8
+ %13 = load i8, ptr %12, align 1
+ %14 = zext i8 %13 to i32
+ %15 = add nuw nsw i32 %14, %11
+ %16 = getelementptr inbounds %struct.FourInts, ptr %0, i32 %8
+ store i32 %15, ptr %16, align 4
+ %17 = getelementptr inbounds i8, ptr %9, i32 1
+ %18 = load i8, ptr %17, align 1
+ %19 = zext i8 %18 to i32
+ %20 = getelementptr inbounds i8, ptr %12, i32 1
+ %21 = load i8, ptr %20, align 1
+ %22 = zext i8 %21 to i32
+ %23 = sub nsw i32 %19, %22
+ %24 = getelementptr inbounds i8, ptr %16, i32 4
+ store i32 %23, ptr %24, align 4
+ %25 = getelementptr inbounds i8, ptr %9, i32 2
+ %26 = load i8, ptr %25, align 1
+ %27 = zext i8 %26 to i32
+ %28 = getelementptr inbounds i8, ptr %12, i32 2
+ %29 = load i8, ptr %28, align 1
+ %30 = zext i8 %29 to i32
+ %31 = mul nuw nsw i32 %30, %27
+ %32 = getelementptr inbounds i8, ptr %16, i32 8
+ store i32 %31, ptr %32, align 4
+ %33 = getelementptr inbounds i8, ptr %9, i32 3
+ %34 = load i8, ptr %33, align 1
+ %35 = getelementptr inbounds i8, ptr %12, i32 3
+ %36 = load i8, ptr %35, align 1
+ %37 = and i8 %36, %34
+ %38 = zext i8 %37 to i32
+ %39 = getelementptr inbounds i8, ptr %16, i32 12
+ store i32 %38, ptr %39, align 4
+ %40 = add nuw i32 %8, 1
+ %41 = icmp eq i32 %40, %3
+ br i1 %41, label %6, label %7
+}
+
+; CHECK-LABEL: scale_uv_row_down2:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.store8
+define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
+ %5 = icmp sgt i32 %3, 0
+ br i1 %5, label %6, label %19
+
+6: ; preds = %4, %6
+ %7 = phi i32 [ %17, %6 ], [ 0, %4 ]
+ %8 = phi ptr [ %15, %6 ], [ %0, %4 ]
+ %9 = phi ptr [ %16, %6 ], [ %2, %4 ]
+ %10 = getelementptr inbounds i8, ptr %8, i32 2
+ %11 = load i8, ptr %10, align 1
+ store i8 %11, ptr %9, align 1
+ %12 = getelementptr inbounds i8, ptr %8, i32 3
+ %13 = load i8, ptr %12, align 1
+ %14 = getelementptr inbounds i8, ptr %9, i32 1
+ store i8 %13, ptr %14, align 1
+ %15 = getelementptr inbounds i8, ptr %8, i32 4
+ %16 = getelementptr inbounds i8, ptr %9, i32 2
+ %17 = add nuw nsw i32 %7, 1
+ %18 = icmp eq i32 %17, %3
+ br i1 %18, label %19, label %6
+
+19: ; preds = %6, %4
+ ret void
+}
+
+; CHECK-LABEL: scale_uv_row_down2_box:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
+ %5 = icmp sgt i32 %3, 0
+ br i1 %5, label %6, label %54
+
+6: ; preds = %4
+ %7 = add nsw i32 %1, 2
+ %8 = add nsw i32 %1, 1
+ %9 = add nsw i32 %1, 3
+ br label %10
+
+10: ; preds = %6, %10
+ %11 = phi i32 [ 0, %6 ], [ %52, %10 ]
+ %12 = phi ptr [ %0, %6 ], [ %50, %10 ]
+ %13 = phi ptr [ %2, %6 ], [ %51, %10 ]
+ %14 = load i8, ptr %12, align 1
+ %15 = zext i8 %14 to i16
+ %16 = getelementptr inbounds i8, ptr %12, i32 2
+ %17 = load i8, ptr %16, align 1
+ %18 = zext i8 %17 to i16
+ %19 = getelementptr inbounds i8, ptr %12, i32 %1
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i16
+ %22 = getelementptr inbounds i8, ptr %12, i32 %7
+ %23 = load i8, ptr %22, align 1
+ %24 = zext i8 %23 to i16
+ %25 = add nuw nsw i16 %15, 2
+ %26 = add nuw nsw i16 %25, %18
+ %27 = add nuw nsw i16 %26, %21
+ %28 = add nuw nsw i16 %27, %24
+ %29 = lshr i16 %28, 2
+ %30 = trunc nuw i16 %29 to i8
+ store i8 %30, ptr %13, align 1
+ %31 = getelementptr inbounds i8, ptr %12, i32 1
+ %32 = load i8, ptr %31, align 1
+ %33 = zext i8 %32 to i16
+ %34 = getelementptr inbounds i8, ptr %12, i32 3
+ %35 = load i8, ptr %34, align 1
+ %36 = zext i8 %35 to i16
+ %37 = getelementptr inbounds i8, ptr %12, i32 %8
+ %38 = load i8, ptr %37, align 1
+ %39 = zext i8 %38 to i16
+ %40 = getelementptr inbounds i8, ptr %12, i32 %9
+ %41 = load i8, ptr %40, align 1
+ %42 = zext i8 %41 to i16
+ %43 = add nuw nsw i16 %33, 2
+ %44 = add nuw nsw i16 %43, %36
+ %45 = add nuw nsw i16 %44, %39
+ %46 = add nuw nsw i16 %45, %42
+ %47 = lshr i16 %46, 2
+ %48 = trunc nuw i16 %47 to i8
+ %49 = getelementptr inbounds i8, ptr %13, i32 1
+ store i8 %48, ptr %49, align 1
+ %50 = getelementptr inbounds i8, ptr %12, i32 4
+ %51 = getelementptr inbounds i8, ptr %13, i32 2
+ %52 = add nuw nsw i32 %11, 1
+ %53 = icmp eq i32 %52, %3
+ br i1 %53, label %54, label %10
+
+54: ; preds = %10, %4
+ ret void
+}
+
+; CHECK-LABEL: scale_uv_row_down2_linear:
+; CHECK: loop
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+; CHECK: i32.load8_u
+; CHECK: i32.load8_u
+; CHECK: i32.shr_u
+; CHECK: i32.store8
+define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
+ %5 = icmp sgt i32 %3, 0
+ br i1 %5, label %6, label %34
+
+6: ; preds = %4, %6
+ %7 = phi i32 [ %32, %6 ], [ 0, %4 ]
+ %8 = phi ptr [ %30, %6 ], [ %0, %4 ]
+ %9 = phi ptr [ %31, %6 ], [ %2, %4 ]
+ %10 = load i8, ptr %8, align 1
+ %11 = zext i8 %10 to i16
+ %12 = getelementptr inbounds i8, ptr %8, i32 2
+ %13 = load i8, ptr %12, align 1
+ %14 = zext i8 %13 to i16
+ %15 = add nuw nsw i16 %11, 1
+ %16 = add nuw nsw i16 %15, %14
+ %17 = lshr i16 %16, 1
+ %18 = trunc nuw i16 %17 to i8
+ store i8 %18, ptr %9, align 1
+ %19 = getelementptr inbounds i8, ptr %8, i32 1
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i16
+ %22 = getelementptr inbounds i8, ptr %8, i32 3
+ %23 = load i8, ptr %22, align 1
+ %24 = zext i8 %23 to i16
+ %25 = add nuw nsw i16 %21, 1
+ %26 = add nuw nsw i16 %25, %24
+ %27 = lshr i16 %26, 1
+ %28 = trunc nuw i16 %27 to i8
+ %29 = getelementptr inbounds i8, ptr %9, i32 1
+ store i8 %28, ptr %29, align 1
+ %30 = getelementptr inbounds i8, ptr %8, i32 4
+ %31 = getelementptr inbounds i8, ptr %9, i32 2
+ %32 = add nuw nsw i32 %7, 1
+ %33 = icmp eq i32 %32, %3
+ br i1 %33, label %34, label %6
+
+34: ; preds = %6, %4
+ ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
new file mode 100644
index 0000000..e4014ba
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -mcpu=mvp -mattr=+reference-types -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK32 %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -mcpu=mvp -mattr=+reference-types -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK64 %s
+; The varargs passed to llvm.wasm.ref.test.func encode the signature to test: result types come first, then an optional token poison separator, then the parameter types.
+define void @test_fpsig_void_void(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_void_void:
+; CHK32: .functype test_fpsig_void_void (i32) -> ()
+; CHK64: .functype test_fpsig_void_void (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test () -> ()
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+define void @test_fpsig_return_i32(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_return_i32:
+; CHK32: .functype test_fpsig_return_i32 (i32) -> ()
+; CHK64: .functype test_fpsig_return_i32 (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test () -> (i32)
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i32 0)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+define void @test_fpsig_return_i64(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_return_i64:
+; CHK32: .functype test_fpsig_return_i64 (i32) -> ()
+; CHK64: .functype test_fpsig_return_i64 (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test () -> (i64)
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i64 0)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+define void @test_fpsig_return_f32(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_return_f32:
+; CHK32: .functype test_fpsig_return_f32 (i32) -> ()
+; CHK64: .functype test_fpsig_return_f32 (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test () -> (f32)
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, float 0.)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+define void @test_fpsig_return_f64(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_return_f64:
+; CHK32: .functype test_fpsig_return_f64 (i32) -> ()
+; CHK64: .functype test_fpsig_return_f64 (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test () -> (f64)
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, double 0.)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+
+define void @test_fpsig_param_f64(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_param_f64:
+; CHK32: .functype test_fpsig_param_f64 (i32) -> ()
+; CHK64: .functype test_fpsig_param_f64 (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test (f64) -> ()
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, token poison, double 0.)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+
+define void @test_fpsig_multiple_params_and_returns(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_multiple_params_and_returns:
+; CHK32: .functype test_fpsig_multiple_params_and_returns (i32) -> ()
+; CHK64: .functype test_fpsig_multiple_params_and_returns (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHECK-NEXT: ref.test (i64, f32, i64) -> (i32, i64, f32, f64)
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, i32 0, i64 0, float 0., double 0., token poison, i64 0, float 0., i64 0)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+
+define void @test_fpsig_ptrs(ptr noundef %func) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fpsig_ptrs:
+; CHK32: .functype test_fpsig_ptrs (i32) -> ()
+; CHK64: .functype test_fpsig_ptrs (i64) -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHK64-NEXT: i32.wrap_i64
+; CHECK-NEXT: table.get __indirect_function_table
+; CHK32-NEXT: ref.test (i32, i32) -> (i32)
+; CHK64-NEXT: ref.test (i64, i64) -> (i64)
+; CHECK-NEXT: call use
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %res = tail call i32 (ptr, ...) @llvm.wasm.ref.test.func(ptr %func, ptr null, token poison, ptr null, ptr null)
+ tail call void @use(i32 noundef %res) #3
+ ret void
+}
+
+
+declare void @use(i32 noundef) local_unnamed_addr #1
diff --git a/llvm/test/CodeGen/WebAssembly/removed-terminator.ll b/llvm/test/CodeGen/WebAssembly/removed-terminator.ll
new file mode 100644
index 0000000..188f6f6
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/removed-terminator.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define void @test(i1 %x) {
+; CHECK-LABEL: test:
+; CHECK: .functype test (i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const -1
+; CHECK-NEXT: i32.xor
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: drop
+; CHECK-NEXT: # %bb.1: # %exit
+; CHECK-NEXT: return
+ %y = xor i1 %x, true
+ ; This br_if's operand (%y) is stackified in RegStackify, but the terminator
+ ; itself is removed later in CFGSort. We need to make sure we unstackify %y so
+ ; that it can be dropped in ExplicitLocals.
+ br i1 %y, label %exit, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
index 8459ec8..b355a0d 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -441,3 +441,31 @@ define <2 x double> @promote_mixed_v2f64(<4 x float> %x, <4 x float> %y) {
%a = fpext <2 x float> %v to <2 x double>
ret <2 x double> %a
}
+
+define <4 x float> @convert_u_v4f32_maybeneg(<4 x i32> %x) {
+; CHECK-LABEL: convert_u_v4f32_maybeneg:
+; CHECK: .functype convert_u_v4f32_maybeneg (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %a = ashr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
+
+define <4 x float> @convert_u_v4f32_nonneg(<4 x i32> %x) {
+; CHECK-LABEL: convert_u_v4f32_nonneg:
+; CHECK: .functype convert_u_v4f32_nonneg (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32x4.shr_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+ %a = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
index c93b8aa..eb39f90 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
@@ -12,7 +12,7 @@ define <4 x float> @extend_to_float_low_i16x8_u(<8 x i16> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extended = uitofp <4 x i16> %low to <4 x float>
@@ -25,7 +25,7 @@ define <4 x float> @extend_to_float_high_i16x8_u(<8 x i16> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.extend_high_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%extended = uitofp <4 x i16> %high to <4 x float>
@@ -39,7 +39,7 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) {
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extended = uitofp <4 x i8> %low to <4 x float>
@@ -55,7 +55,7 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) {
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%extended = uitofp <4 x i8> %high to <4 x float>
@@ -136,7 +136,7 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f64x2.convert_low_i32x4_u
+; CHECK-NEXT: f64x2.convert_low_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
%extended = uitofp <2 x i16> %low to <2 x double>
diff --git a/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll b/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll
index 3b3a460..ab6672e 100644
--- a/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll
+++ b/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: sed -e s/.Cxx:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CXX,X64CXX
+; RUN: sed -e s/.Cxx:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CXX
; RUN: sed -e s/.Seh:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=SEH
; RUN: %if aarch64-registered-target %{ sed -e s/.Cxx:// %s | llc -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefix=CXX %}
; RUN: %if aarch64-registered-target %{ sed -e s/.Seh:// %s | llc -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefix=SEH %}
@@ -49,18 +49,14 @@ catch.body.2:
; CXX-NEXT: .[[ENTRY:long|word]] .Lfunc_begin0@IMGREL
; CXX-NEXT: .[[ENTRY]] -1
; CXX-NEXT: .[[ENTRY]] .Ltmp0@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] 1
; CXX-NEXT: .[[ENTRY]] .Ltmp1@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] -1
; CXX-NEXT: .[[ENTRY]] "?catch$3@?0?test@4HA"@IMGREL
; CXX-NEXT: .[[ENTRY]] 2
; CXX-NEXT: .[[ENTRY]] .Ltmp2@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] 3
; CXX-NEXT: .[[ENTRY]] .Ltmp3@IMGREL
-; X64CXX-SAME: +1
; CXX-NEXT: .[[ENTRY]] 2
; CXX-NEXT: .[[ENTRY]] "?catch$5@?0?test@4HA"@IMGREL
; CXX-NEXT: .[[ENTRY]] 4
@@ -70,19 +66,19 @@ catch.body.2:
; SEH: .LBB0_[[CATCH:[0-9]+]]: {{.*}} %catch.body
; SEH-LABEL: .Llsda_begin0:
; SEH-NEXT: .[[ENTRY:long|word]] .Ltmp0@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL
; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL
; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH]]@IMGREL
; SEH-NEXT: .[[ENTRY]] .Ltmp0@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL
; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL
; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH2]]@IMGREL
; SEH-NEXT: .[[ENTRY]] .Ltmp2@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL
; SEH-NEXT: .[[ENTRY]] "?dtor$[[DTOR:[0-9]+]]@?0?test@4HA"@IMGREL
; SEH-NEXT: .[[ENTRY]] 0
; SEH-NEXT: .[[ENTRY]] .Ltmp2@IMGREL
-; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL+1
+; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL
; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL
; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH2]]@IMGREL
; SEH-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll b/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll
index 2bd004e..9de79ee 100644
--- a/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll
+++ b/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll
@@ -1,4 +1,5 @@
-; RUN: llc %s --mtriple=x86_64-pc-windows-msvc -o - | FileCheck %s
+; RUN: llc %s --mtriple=x86_64-pc-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: %if aarch64-registered-target %{ llc %s --mtriple=aarch64-pc-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,ARM64 %}
; Tests the fixed object layouts when two catchpads re-use the same stack
; allocation for their catch objects.
@@ -18,27 +19,36 @@
; }
; ```
-; Minimum stack alloc is 64 bytes, so no change there.
; CHECK-LABEL: calls_boom:
-; CHECK: subq $64, %rsp
-; CHECK: .seh_stackalloc 64
+; Minimum stack alloc is 64 bytes, so no change there.
+; X64: subq $64, %rsp
+; X64: .seh_stackalloc 64
+; Only need 48 bytes on the stack, not 64.
+; ARM64: sub sp, sp, #48
+; ARM64: .seh_stackalloc 48
; Both the catch blocks load from the same address.
; CHECK-LABEL: "?catch$3@?0?calls_boom@4HA":
-; CHECK: movq -8(%rbp), %rax
+; X64: movq -8(%rbp), %rax
+; ARM64: ldr x8, [x29, #24]
; CHECK-LABEL: "?catch$4@?0?calls_boom@4HA":
-; CHECK: movq -8(%rbp), %rax
+; X64: movq -8(%rbp), %rax
+; ARM64: ldr x8, [x29, #24]
-; There's enough space for the UnwindHelp to be at 48 instead of 40
; CHECK-LABEL: $cppxdata$calls_boom:
-; CHECK: .long 48 # UnwindHelp
+; There's enough space for the UnwindHelp to be at 48 instead of 40
+; X64: .long 48 # UnwindHelp
+; There's enough space for the UnwindHelp to be at -16 instead of -32
+; ARM64: .word -16 // UnwindHelp
; Both catches have the same object offset.
; CHECK-LABEL: $handlerMap$0$calls_boom:
-; CHECK: .long 56 # CatchObjOffset
-; CHECK-NEXT: .long "?catch$3@?0?calls_boom@4HA"@IMGREL # Handler
-; CHECK: .long 56 # CatchObjOffset
-; CHECK-NEXT: .long "?catch$4@?0?calls_boom@4HA"@IMGREL # Handler
+; X64: .long 56 # CatchObjOffset
+; ARM64: .word -8 // CatchObjOffset
+; CHECK-NEXT: "?catch$3@?0?calls_boom@4HA"@IMGREL
+; X64: .long 56 # CatchObjOffset
+; ARM64: .word -8 // CatchObjOffset
+; CHECK-NEXT: "?catch$4@?0?calls_boom@4HA"@IMGREL
%rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] }
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index 2911edf..d9064c6 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -1076,15 +1076,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -1107,15 +1107,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -1142,32 +1142,32 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: movl 24(%ebp), %esi
-; X86-NEXT: subl 40(%ebp), %esi
+; X86-NEXT: subl 40(%ebp), %edi
; X86-NEXT: sbbl 44(%ebp), %edx
; X86-NEXT: sbbl 48(%ebp), %ecx
; X86-NEXT: sbbl 52(%ebp), %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %edi, %esi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: subl %esi, %ebx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1203,32 +1203,32 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: movl 24(%ebp), %esi
-; X86-NEXT: subl 40(%ebp), %esi
+; X86-NEXT: subl 40(%ebp), %edi
; X86-NEXT: sbbl 44(%ebp), %edx
; X86-NEXT: sbbl 48(%ebp), %ecx
; X86-NEXT: sbbl 52(%ebp), %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %edi, %esi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: subl %esi, %ebx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 217cceb..0de308a 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: not_avg_v16i8_wide_constants:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm1
-; SSE2-NEXT: movdqa (%rsi), %xmm2
+; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm5
@@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
@@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm12
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm13
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
@@ -1783,43 +1783,45 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm15
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT: movapd %xmm4, %xmm5
; SSE2-NEXT: andpd %xmm1, %xmm5
; SSE2-NEXT: xorpd %xmm4, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: paddw %xmm5, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movapd %xmm0, %xmm3
-; SSE2-NEXT: andpd %xmm2, %xmm3
-; SSE2-NEXT: xorpd %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: paddw %xmm3, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm0, %xmm3
+; SSE2-NEXT: xorpd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
@@ -1829,74 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT: vpextrd $2, %xmm4, %eax
-; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm3, %ecx
+; AVX1-NEXT: vpextrw $5, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpextrw $2, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm5
-; AVX1-NEXT: vpextrw $1, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm6
-; AVX1-NEXT: vpextrw $0, %xmm3, %edx
+; AVX1-NEXT: vpextrw $4, %xmm3, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm5
+; AVX1-NEXT: vpextrw $1, %xmm3, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vpextrw $0, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpextrw $3, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm8
-; AVX1-NEXT: vpextrw $2, %xmm2, %edx
+; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm8
+; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT: decq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpextrw $7, %xmm2, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm9
-; AVX1-NEXT: vpextrw $1, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm10
-; AVX1-NEXT: vpextrw $0, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm11
-; AVX1-NEXT: vpextrw $5, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm2, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm10
+; AVX1-NEXT: vpextrw $5, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm11
+; AVX1-NEXT: vpextrw $4, %xmm2, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm12
-; AVX1-NEXT: vpextrw $4, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm13
-; AVX1-NEXT: vpextrw $5, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm14
-; AVX1-NEXT: vpextrw $4, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm15
-; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $1, %xmm2, %edx
; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm2, %ecx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm13
+; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm14
+; AVX1-NEXT: vpextrw $3, %xmm2, %eax
+; AVX1-NEXT: decq %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm15
+; AVX1-NEXT: vpextrw $2, %xmm2, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: decl %eax
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: decl %edx
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX1-NEXT: vmovd %ecx, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm3 = ymm6[0,0,2,2]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX1-NEXT: vmovd %edx, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1
; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
index ab9fa22..24d3030 100644
--- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
+++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
@@ -48,6 +48,6 @@ return: ; preds = %catch, %entry
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .LBB0_[[catch]]@IMGREL
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
index c4c194e..7855ff2 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
@@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 {
; WIN64-NEXT: # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1
; WIN64-NEXT: .LBB1_2: # %bb2
-; WIN64-NEXT: nop # encoding: [0x90]
; WIN64-NEXT: .seh_startepilogue
; WIN64-NEXT: popq %rbx # encoding: [0x5b]
; WIN64-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 9c1d830..2859a87 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize {
; WIN64-NEXT: # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1
; WIN64-NEXT: .LBB1_2: # %bb2
-; WIN64-NEXT: nop # encoding: [0x90]
; WIN64-NEXT: .seh_startepilogue
; WIN64-NEXT: popq %rbx # encoding: [0x5b]
; WIN64-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 661e7bb..455b72d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -172,10 +172,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %ecx, %esi
; X86-NEXT: xorl %edx, %esi
; X86-NEXT: movl 48(%ebp), %ecx
; X86-NEXT: xorl %edx, %ecx
@@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %edi, %edi
; X86-NEXT: xorl $31, %edi
-; X86-NEXT: addl $32, %edi
+; X86-NEXT: orl $32, %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: addl $64, %edi
+; X86-NEXT: orl $64, %edi
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl %eax, %esi
; X86-NEXT: cmovnel %ecx, %edx
@@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 370e1c6..859e924 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -173,17 +173,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl 48(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl %ebx, %eax
; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
+; X86-NEXT: orl $32, %eax
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %eax
-; X86-NEXT: addl $64, %eax
+; X86-NEXT: orl $64, %eax
; X86-NEXT: movl 48(%ebp), %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %eax
@@ -193,7 +193,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl 32(%ebp), %ecx
; X86-NEXT: bsrl %ecx, %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl 28(%ebp), %edi
@@ -201,10 +201,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl 24(%ebp), %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 0f66d42..953a5e7 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: vmovdqa (%ecx), %xmm0
-; X86-NEXT: vpand (%edx), %xmm0, %xmm0
+; X86-NEXT: vmovdqa (%edx), %xmm0
+; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
-; X64-NEXT: vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
@@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: vmovdqa (%edx), %xmm0
-; X86-NEXT: vpand (%esi), %xmm0, %xmm0
+; X86-NEXT: vmovdqa (%esi), %xmm0
+; X86-NEXT: vpand (%edx), %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: popl %esi
@@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
;
; X64-LABEL: freeze_extractelement_escape:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
-; X64-NEXT: vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
@@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
; X86-NEXT: movl 32(%ebp), %edx
; X86-NEXT: movl 12(%ebp), %esi
; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: vmovaps (%esi), %xmm0
-; X86-NEXT: vandps (%edi), %xmm0, %xmm0
+; X86-NEXT: vmovaps (%edi), %xmm0
+; X86-NEXT: vandps (%esi), %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%esp)
; X86-NEXT: movzbl (%esp,%ecx), %ecx
; X86-NEXT: cmpb (%esp,%eax), %cl
@@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
; X64: # %bb.0:
; X64-NEXT: andl $15, %ecx
; X64-NEXT: andl $15, %edx
-; X64-NEXT: vmovaps (%rsi), %xmm0
-; X64-NEXT: vandps (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vandps (%rsi), %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -24(%rsp,%rdx), %eax
; X64-NEXT: cmpb -24(%rsp,%rcx), %al
diff --git a/llvm/test/CodeGen/X86/noreturn-call-win64.ll b/llvm/test/CodeGen/X86/noreturn-call-win64.ll
index 57aa022..13be1f13 100644
--- a/llvm/test/CodeGen/X86/noreturn-call-win64.ll
+++ b/llvm/test/CodeGen/X86/noreturn-call-win64.ll
@@ -111,3 +111,15 @@ declare dso_local void @"??1MakeCleanup@@QEAA@XZ"(ptr)
; CHECK: # %unreachable
; CHECK: int3
; CHECK: .seh_handlerdata
+
+
+define dso_local void @last_call_no_return() {
+ call void @abort1()
+ unreachable
+}
+
+; CHECK-LABEL: last_call_no_return:
+; CHECK: callq abort1
+; CHECK-NEXT: int3
+; CHECK-NEXT: .seh_endproc
+
diff --git a/llvm/test/CodeGen/X86/peephole-copy.mir b/llvm/test/CodeGen/X86/peephole-copy.mir
index e24abf84..f399398 100644
--- a/llvm/test/CodeGen/X86/peephole-copy.mir
+++ b/llvm/test/CodeGen/X86/peephole-copy.mir
@@ -22,14 +22,14 @@ body: |
bb.0:
; CHECK-LABEL: name: c
; CHECK: [[MOV32ri:%[0-9]+]]:gr32_abcd = MOV32ri 512
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df
; CHECK-NEXT: [[MOV32ri1:%[0-9]+]]:gr32_abcd = MOV32ri 512
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df
; CHECK-NEXT: RET 0
%2 = MOV32ri 512
%0 = COPY %2
- INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df
+ INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df
%1 = COPY %2
- INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df
+ INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df
RET 0
...
diff --git a/llvm/test/CodeGen/X86/pr149841.ll b/llvm/test/CodeGen/X86/pr149841.ll
new file mode 100644
index 0000000..c17a617
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr149841.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.bar = type { [5 x ptr] }
+
+@global = external dso_local global %struct.bar
+
+define i1 @foo(ptr %arg, i1 %arg1) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: cmpq $global+1, %rdi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: andb %sil, %al
+; CHECK-NEXT: retq
+bb:
+ #dbg_value(ptr @global, !3, !DIExpression(), !5)
+ %icmp = icmp ne ptr %arg, getelementptr inbounds nuw (i8, ptr @global, i64 1)
+ %select = select i1 %arg1, i1 %icmp, i1 false
+ ret i1 %select
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "x.c", directory: "/proc/self/cwd")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1)
+!4 = distinct !DISubprogram(name: "x", scope: null, file: !1, spFlags: DISPFlagDefinition, unit: !0)
+!5 = !DILocation(line: 0, scope: !4)
+
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 2d1b7fc..9728e13 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -42,10 +42,10 @@ define i64 @PR62286(i32 %a) {
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll
new file mode 100644
index 0000000..841061c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll
@@ -0,0 +1,47 @@
+; REQUIRES: asserts
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-windows-msvc < %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: warning: Guid:8314849053352128226 Name:inlinee does not exist in pseudo probe desc
+; CHECK: warning: Guid:6492337042787843907 Name:extract2 does not exist in pseudo probe desc
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @extract1() !dbg !8 {
+entry:
+ call void @llvm.pseudoprobe(i64 6028998432455395745, i64 1, i32 0, i64 -1), !dbg !11
+ call void @llvm.pseudoprobe(i64 8314849053352128226, i64 1, i32 0, i64 -1), !dbg !12
+ ret void, !dbg !16
+}
+
+define void @extract2() !dbg !17 {
+entry:
+ call void @llvm.pseudoprobe(i64 6492337042787843907, i64 1, i32 0, i64 -1), !dbg !18
+ ret void, !dbg !18
+}
+
+declare void @llvm.pseudoprobe(i64, i64, i32, i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6}
+!llvm.pseudo_probe_desc = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/home/foo")
+!2 = !{i32 7, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"uwtable", i32 2}
+!6 = !{i32 7, !"frame-pointer", i32 2}
+!7 = !{i64 6028998432455395745, i64 281479271677951, !"extract1"}
+!8 = distinct !DISubprogram(name: "extract1", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !0)
+!9 = !DISubroutineType(types: !10)
+!10 = !{}
+!11 = !DILocation(line: 5, column: 3, scope: !8)
+!12 = !DILocation(line: 2, column: 1, scope: !13, inlinedAt: !14)
+!13 = distinct !DISubprogram(name: "inlinee", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0)
+!14 = distinct !DILocation(line: 5, column: 3, scope: !15)
+!15 = !DILexicalBlockFile(scope: !8, file: !1, discriminator: 455082007)
+!16 = !DILocation(line: 6, column: 1, scope: !8)
+!17 = distinct !DISubprogram(name: "extract2", scope: !1, file: !1, line: 8, type: !9, scopeLine: 8, spFlags: DISPFlagDefinition, unit: !0)
+!18 = !DILocation(line: 9, column: 1, scope: !17)
diff --git a/llvm/test/CodeGen/X86/seh-catch-all.ll b/llvm/test/CodeGen/X86/seh-catch-all.ll
index 5250bb9..4e25aab 100644
--- a/llvm/test/CodeGen/X86/seh-catch-all.ll
+++ b/llvm/test/CodeGen/X86/seh-catch-all.ll
@@ -40,7 +40,7 @@ catchall:
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL
-; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL+1
+; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .LBB0_2@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index d958580..cb85f39 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -123,23 +123,23 @@ __except.ret: ; preds = %catch.dispatch.7
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .LBB1_[[except1bb]]@IMGREL
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL
; CHECK-NEXT: .long .LBB1_[[except2bb]]@IMGREL
; CHECK-NEXT: .long .Ltmp2@IMGREL
-; CHECK-NEXT: .long .Ltmp3@IMGREL+1
+; CHECK-NEXT: .long .Ltmp3@IMGREL
; CHECK-NEXT: .long "?dtor$[[finbb:[0-9]+]]@?0?main@4HA"@IMGREL
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long .Ltmp2@IMGREL
-; CHECK-NEXT: .long .Ltmp3@IMGREL+1
+; CHECK-NEXT: .long .Ltmp3@IMGREL
; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL
; CHECK-NEXT: .long .LBB1_3@IMGREL
; CHECK-NEXT: .long .Ltmp6@IMGREL
-; CHECK-NEXT: .long .Ltmp7@IMGREL+1
+; CHECK-NEXT: .long .Ltmp7@IMGREL
; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL
; CHECK-NEXT: .long .LBB1_3@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll
index 7f70655..539d776 100644
--- a/llvm/test/CodeGen/X86/seh-except-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-except-finally.ll
@@ -83,15 +83,15 @@ __try.cont: ; preds = %__except, %invoke.c
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long "?dtor$2@?0?use_both@4HA"@IMGREL
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL
; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL
; CHECK-NEXT: .long .Ltmp4@IMGREL
-; CHECK-NEXT: .long .Ltmp5@IMGREL+1
+; CHECK-NEXT: .long .Ltmp5@IMGREL
; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL
; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll
index 41823df..6093e5e 100644
--- a/llvm/test/CodeGen/X86/seh-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-finally.ll
@@ -30,7 +30,7 @@ lpad: ; preds = %entry
; X64-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites
; X64-NEXT: .Llsda_begin0:
; X64-NEXT: .long .Ltmp0@IMGREL # LabelStart
-; X64-NEXT: .long .Ltmp1@IMGREL+1 # LabelEnd
+; X64-NEXT: .long .Ltmp1@IMGREL # LabelEnd
; X64-NEXT: .long "?dtor$2@?0?main@4HA"@IMGREL # FinallyFunclet
; X64-NEXT: .long 0 # Null
; X64-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-safe-div.ll b/llvm/test/CodeGen/X86/seh-safe-div.ll
index 542d9f6..20169f8 100644
--- a/llvm/test/CodeGen/X86/seh-safe-div.ll
+++ b/llvm/test/CodeGen/X86/seh-safe-div.ll
@@ -60,6 +60,7 @@ __try.cont:
; CHECK: .Ltmp0:
; CHECK: leaq [[rloc:.*\(%rbp\)]], %rcx
; CHECK: callq try_body
+; CHECK: nop
; CHECK-NEXT: .Ltmp1
; CHECK: [[cont_bb:\.LBB0_[0-9]+]]:
; CHECK: movl [[rloc]], %eax
@@ -82,11 +83,11 @@ __try.cont:
; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
; CHECK-NEXT: .Llsda_begin0:
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long safe_div_filt0@IMGREL
; CHECK-NEXT: .long [[handler0]]@IMGREL
; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
; CHECK-NEXT: .long safe_div_filt1@IMGREL
; CHECK-NEXT: .long [[handler1]]@IMGREL
; CHECK-NEXT: .Llsda_end0:
diff --git a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll
index 2c576df..5a6aeb6 100644
--- a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll
+++ b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll
@@ -56,8 +56,8 @@ declare dso_local void @printf(ptr, ...)
; CHECK-NEXT:$ip2state$test:
; CHECK-NEXT: .long .Lfunc_begin0@IMGREL # IP
; CHECK-NEXT: .long -1 # ToState
-; CHECK-NEXT: .long .Ltmp0@IMGREL+1 # IP
+; CHECK-NEXT: .long .Ltmp0@IMGREL # IP
; CHECK-NEXT: .long 0 # ToState
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1 # IP
+; CHECK-NEXT: .long .Ltmp1@IMGREL # IP
; CHECK-NEXT: .long -1 # ToState

diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index d273d09..c7cf9cb 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -229,9 +229,10 @@ define i32 @expensive_val_operand4(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
}

; Expensive cold value operand with unsafe-to-sink (due to lifetime-end marker) load (partial slice sinking).
-define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+define i32 @expensive_val_operand5(i32 %b, i32 %y, i1 %cmp) {
; CHECK-LABEL: @expensive_val_operand5(
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 8
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]])
; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
@@ -242,6 +243,7 @@ define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
; CHECK-NEXT: ret i32 [[SEL]]
;
+ %a = alloca i32
%load = load i32, ptr %a, align 8
call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %a)
%x = add i32 %load, %b
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index d2b292f..2ac2be5 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body
; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5
-; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi
; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8
; CHECK-AVX2-NEXT: vmovq %xmm5, %r9
diff --git a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
index e2de2ff..74fe07e 100644
--- a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
+++ b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
@@ -84,12 +84,12 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 {
; X86_64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X86_64-NEXT: .Ltmp0:
; X86_64-NEXT: callq throw
+; X86_64-NEXT: nop
; X86_64-NEXT: .Ltmp1:
; X86_64-NEXT: # %bb.1: # %bb14
; X86_64-NEXT: .LBB0_3: # Block address taken
; X86_64-NEXT: # %exit
; X86_64-NEXT: $ehgcr_0_3:
-; X86_64-NEXT: nop
; X86_64-NEXT: .seh_startepilogue
; X86_64-NEXT: addq $64, %rsp
; X86_64-NEXT: popq %rbp
diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll
index e556900..1dc454dd 100644
--- a/llvm/test/CodeGen/X86/swap.ll
+++ b/llvm/test/CodeGen/X86/swap.ll
@@ -47,12 +47,10 @@ define dso_local void @onealloc_noreadback(ptr nocapture %a, ptr nocapture %b) l
entry:
%alloc = alloca [16 x i8], i8 2, align 1
%part2 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc)
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part2)
+ call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc)
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %a, i64 16, i1 false)
tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part2, ptr align 1 %b, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part2)
+ call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc)
ret void
}
@@ -115,8 +113,9 @@ define dso_local void @onealloc_readback_1(ptr nocapture %a, ptr nocapture %b) l
;
; AA-LABEL: onealloc_readback_1:
; AA: # %bb.0: # %entry
-; AA-NEXT: vmovups (%rsi), %xmm0
+; AA-NEXT: vmovups (%rdi), %xmm0
; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AA-NEXT: vmovups (%rsi), %xmm0
; AA-NEXT: vmovups %xmm0, (%rdi)
; AA-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll
index 967e125..f3bef47 100644
--- a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll
+++ b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll
@@ -37,9 +37,11 @@ cond.end: ; preds = %entry, %cond.true
; CHECK: testq
; CHECK: je
; CHECK: callq alloc
+; CHECK-NEXT: nop
; CHECK-NEXT: [[L1:.Ltmp[0-9]+]]
; CHECK: jmp f2 # TAILCALL
; CHECK: callq alloc
+; CHECK-NEXT: nop
; CHECK-NEXT: [[L3:.Ltmp[0-9]+]]
; CHECK: jmp f2 # TAILCALL

diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 087cd30..9bd38db 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -104,6 +104,72 @@ entry:
}
declare <2 x double> @foo()

+define i64 @pr150117(<31 x i8> %a0) nounwind {
+; X86-LABEL: pr150117:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shll $8, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: shll $8, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: shll $24, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: pinsrw $2, %edx, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $8, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: pinsrw $3, %ecx, %xmm0
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movd %xmm0, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: pr150117:
+; X64: # %bb.0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r8d
+; X64-NEXT: shll $8, %r8d
+; X64-NEXT: orl %edi, %r8d
+; X64-NEXT: shll $8, %esi
+; X64-NEXT: orl %edx, %esi
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %esi, %ecx
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT: shll $24, %edx
+; X64-NEXT: orl %ecx, %edx
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: pinsrw $2, %r8d, %xmm0
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: shll $8, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: pinsrw $3, %ecx, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: retq
+ %shuffle = shufflevector <31 x i8> %a0, <31 x i8> zeroinitializer, <32 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bitcast = bitcast <32 x i8> %shuffle to <4 x i64>
+ %elt = extractelement <4 x i64> %bitcast, i64 0
+ ret i64 %elt
+}
+
; OSS-Fuzz #15662
; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15662
define <4 x i32> @ossfuzz15662(ptr %in) {
diff --git a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
index bfb9c43..0bf8370 100644
--- a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
@@ -103,15 +103,15 @@ handler2:
; X64: $ip2state$try_in_catch:
; X64-NEXT: .long .Lfunc_begin0@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp0@IMGREL+1
+; X64-NEXT: .long .Ltmp0@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp1@IMGREL+1
+; X64-NEXT: .long .Ltmp1@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch$2@?0?try_in_catch@4HA"@IMGREL
; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp2@IMGREL+1
+; X64-NEXT: .long .Ltmp2@IMGREL
; X64-NEXT: .long 2
-; X64-NEXT: .long .Ltmp3@IMGREL+1
+; X64-NEXT: .long .Ltmp3@IMGREL
; X64-NEXT: .long 1
; X64-NEXT: .long "?catch$4@?0?try_in_catch@4HA"@IMGREL
; X64-NEXT: .long 3
diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll
index 2491946..62ea510 100644
--- a/llvm/test/CodeGen/X86/win-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad.ll
@@ -214,9 +214,9 @@ try.cont:
; X64: $ip2state$try_catch_catch:
; X64-NEXT: .long .Lfunc_begin0@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp0@IMGREL+1
+; X64-NEXT: .long .Ltmp0@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp1@IMGREL+1
+; X64-NEXT: .long .Ltmp1@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch$[[catch1bb]]@?0?try_catch_catch@4HA"@IMGREL
; X64-NEXT: .long 1
@@ -357,9 +357,9 @@ try.cont:
; X64-LABEL: $ip2state$branch_to_normal_dest:
; X64-NEXT: .long .Lfunc_begin1@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL+1
+; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL+1
+; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch$[[catchbb]]@?0?branch_to_normal_dest@4HA"@IMGREL
; X64-NEXT: .long 1
diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll
index e3f7f5b..e9265a1 100644
--- a/llvm/test/CodeGen/X86/win-cleanuppad.ll
+++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll
@@ -191,7 +191,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
; X64-NEXT: .long 1
; X64-NEXT: .long .Ltmp6@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp7@IMGREL+1
+; X64-NEXT: .long .Ltmp7@IMGREL
; X64-NEXT: .long -1

attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/win32-eh-states.ll b/llvm/test/CodeGen/X86/win32-eh-states.ll
index 42ae5b0..e645199 100644
--- a/llvm/test/CodeGen/X86/win32-eh-states.ll
+++ b/llvm/test/CodeGen/X86/win32-eh-states.ll
@@ -86,11 +86,11 @@ catch.7:
; X64-LABEL: $ip2state$f:
; X64-NEXT: .long .Lfunc_begin0@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch${{.*}}@?0?f@4HA"@IMGREL
; X64-NEXT: .long 2
@@ -189,15 +189,15 @@ unreachable: ; preds = %entry
; X64-LABEL: $ip2state$g:
; X64-NEXT: .long .Lfunc_begin1@IMGREL
; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long -1
; X64-NEXT: .long "?catch${{.*}}@?0?g@4HA"@IMGREL
; X64-NEXT: .long 2
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 3
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
; X64-NEXT: .long 2


diff --git a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
index bc5be7a..75f156f 100644
--- a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
+++ b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
@@ -8,8 +8,8 @@ define i32 @foobar() gc "statepoint-example" personality ptr @__gxx_personality_
; CHECK-NEXT: .seh_stackalloc 40
; CHECK-NEXT: .seh_endprologue
; CHECK-NEXT: callq bar
-; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: nop
+; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/wineh-coreclr.ll b/llvm/test/CodeGen/X86/wineh-coreclr.ll
index baf5eaa..a3d0fde 100644
--- a/llvm/test/CodeGen/X86/wineh-coreclr.ll
+++ b/llvm/test/CodeGen/X86/wineh-coreclr.ll
@@ -38,6 +38,7 @@ entry:
; CHECK: [[test1_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f1:.+]]:
invoke void @f(i32 1)
to label %inner_try unwind label %finally
@@ -46,6 +47,7 @@ inner_try:
; CHECK: [[test1_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f2:.+]]:
invoke void @f(i32 2)
to label %finally.clone unwind label %exn.dispatch
@@ -69,6 +71,7 @@ catch1:
; CHECK: [[test1_before_f3:.+]]:
; CHECK-NEXT: movl $3, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f3:.+]]:
invoke void @f(i32 3) [ "funclet"(token %catch.pad1) ]
to label %catch1.ret unwind label %finally
@@ -92,6 +95,7 @@ catch2:
; CHECK: [[test1_before_f4:.+]]:
; CHECK-NEXT: movl $4, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f4:.+]]:
invoke void @f(i32 4) [ "funclet"(token %catch.pad2) ]
to label %try_in_catch unwind label %finally
@@ -100,6 +104,7 @@ try_in_catch:
; CHECK: [[test1_before_f5:.+]]:
; CHECK-NEXT: movl $5, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f5:.+]]:
invoke void @f(i32 5) [ "funclet"(token %catch.pad2) ]
to label %catch2.ret unwind label %fault
@@ -116,6 +121,7 @@ fault:
; CHECK: [[test1_before_f6:.+]]:
; CHECK-NEXT: movl $6, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test1_after_f6:.+]]:
invoke void @f(i32 6) [ "funclet"(token %fault.pad) ]
to label %fault.ret unwind label %finally
@@ -312,6 +318,7 @@ unreachable:
; CHECK: [[test2_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test2_after_f1:.+]]:
; CHECK: .seh_proc [[test2_catch1:[^ ]+]]
; CHECK: .seh_proc [[test2_catch2:[^ ]+]]
@@ -320,6 +327,7 @@ unreachable:
; CHECK: [[test2_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test2_after_f2:.+]]:
; CHECK: int3
; CHECK: [[test2_end:.*func_end.*]]:
@@ -448,6 +456,7 @@ entry:
; CHECK: [[test3_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f1:.+]]:
invoke void @f(i32 1)
to label %exit unwind label %fault1
@@ -474,6 +483,7 @@ fault4:
; CHECK: [[test3_before_f6:.+]]:
; CHECK-NEXT: movl $6, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f6:.+]]:
invoke void @f(i32 6) ["funclet"(token %fault.pad4)]
to label %fault4.cont unwind label %exn.dispatch1
@@ -482,6 +492,7 @@ fault4.cont:
; CHECK: [[test3_before_f7:.+]]:
; CHECK-NEXT: movl $7, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f7:.+]]:
invoke void @f(i32 7) ["funclet"(token %fault.pad4)]
to label %unreachable unwind label %fault5
@@ -512,6 +523,7 @@ unreachable:
; CHECK: [[test3_before_f4:.+]]:
; CHECK-NEXT: movl $4, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f4:.+]]:
; CHECK: int3
; CHECK: .seh_proc [[test3_fault2:[^ ]+]]
@@ -520,6 +532,7 @@ unreachable:
; CHECK: [[test3_before_f3:.+]]:
; CHECK-NEXT: movl $3, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f3:.+]]:
; CHECK: int3
; CHECK: .seh_proc [[test3_fault1:[^ ]+]]
@@ -528,6 +541,7 @@ unreachable:
; CHECK: [[test3_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
; CHECK-NEXT: [[test3_after_f2:.+]]:
; CHECK: int3
; CHECK: [[test3_end:.*func_end.*]]:
diff --git a/llvm/test/CodeGen/XCore/exception.ll b/llvm/test/CodeGen/XCore/exception.ll
index f222297..bb5f3f4 100644
--- a/llvm/test/CodeGen/XCore/exception.ll
+++ b/llvm/test/CodeGen/XCore/exception.ll
@@ -60,7 +60,7 @@ entry:
; CHECK: [[PRE_G:.L[a-zA-Z0-9_]+]]
; CHECK: bl g
; CHECK: [[POST_G:.L[a-zA-Z0-9_]+]]
-; CHECK: [[RETURN:.L[a-zA-Z0-9_]+]]
+; CHECK: [[RETURN:^.L[a-zA-Z0-9_]+]]
; CHECK: ldw r6, sp[1]
; CHECK: ldw r5, sp[2]
; CHECK: ldw r4, sp[3]
diff --git a/llvm/test/DebugInfo/Generic/mixed-source.ll b/llvm/test/DebugInfo/Generic/mixed-source.ll
index d5586f8..ee3598f 100644
--- a/llvm/test/DebugInfo/Generic/mixed-source.ll
+++ b/llvm/test/DebugInfo/Generic/mixed-source.ll
@@ -5,36 +5,66 @@

; CHECK: include_directories[ 0] = "dir"
; CHECK-NEXT: file_names[ 0]:
+; CHECK-NEXT: name: "main.c"
+; CHECK-NEXT: dir_index: 0
+; CHECK-NOT: source:
+; CHECK-NEXT: file_names[ 1]:
; CHECK-NEXT: name: "foo.c"
; CHECK-NEXT: dir_index: 0
; CHECK-NEXT: source: "void foo() { }\n"
-; CHECK-NEXT: file_names[ 1]:
-; CHECK-NEXT: name: "bar.h"
+; CHECK-NEXT: file_names[ 2]:
+; CHECK-NEXT: name: "newline.h"
+; CHECK-NEXT: dir_index: 0
+; CHECK-NEXT: source: "\n"
+; CHECK-NEXT: file_names[ 3]:
+; CHECK-NEXT: name: "empty.h"
+; CHECK-NEXT: dir_index: 0
+; CHECK-NEXT: source: "\n"
+; CHECK-NEXT: file_names[ 4]:
+; CHECK-NEXT: name: "absent.h"
; CHECK-NEXT: dir_index: 0
; CHECK-NOT: source:

; Test that DIFiles mixing source and no-source within a DICompileUnit works.

-define dso_local void @foo() !dbg !5 {
+define dso_local void @foo() !dbg !6 {
ret void, !dbg !7
}

-define dso_local void @bar() !dbg !6 {
- ret void, !dbg !8
+define dso_local void @newline() !dbg !9 {
+ ret void, !dbg !10
}

-!llvm.dbg.cu = !{!4}
+define dso_local void @empty() !dbg !12 {
+ ret void, !dbg !13
+}
+
+define dso_local void @absent() !dbg !15 {
+ ret void, !dbg !16
+}
+
+!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!0, !1}

!0 = !{i32 2, !"Dwarf Version", i32 5}
!1 = !{i32 2, !"Debug Info Version", i32 3}
-!2 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A")
-!3 = !DIFile(filename: "bar.h", directory: "dir")
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, emissionKind: FullDebug, file: !4)
+!3 = !DISubroutineType(types: !{})
+!4 = !DIFile(filename: "main.c", directory: "dir")
+
+!5 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A")
+!6 = distinct !DISubprogram(name: "foo", file: !5, line: 1, type: !3, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !2)
+!7 = !DILocation(line: 1, scope: !6)
+
+!8 = !DIFile(filename: "newline.h", directory: "dir", source: "\0A")
+!9 = distinct !DISubprogram(name: "newline", file: !8, line: 1, type: !3, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !2)
+!10 = !DILocation(line: 1, scope: !9)
+
+!11 = !DIFile(filename: "empty.h", directory: "dir", source: "")
+!12 = distinct !DISubprogram(name: "empty", file: !11, line: 1, type: !3, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !2)
+!13 = !DILocation(line: 1, scope: !12)

-!4 = distinct !DICompileUnit(language: DW_LANG_C99, emissionKind: FullDebug, file: !2)
-!5 = distinct !DISubprogram(name: "foo", file: !2, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !4)
-!6 = distinct !DISubprogram(name: "bar", file: !3, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !4)
-!7 = !DILocation(line: 1, scope: !5)
-!8 = !DILocation(line: 1, scope: !6)
-!9 = !DISubroutineType(types: !{})
+!14 = !DIFile(filename: "absent.h", directory: "dir")
+!15 = distinct !DISubprogram(name: "absent", file: !14, line: 1, type: !3, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !2)
+!16 = !DILocation(line: 1, scope: !15)
diff --git a/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s b/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s
index 0fca88b..ddbf02c 100644
--- a/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s
+++ b/llvm/test/ExecutionEngine/RuntimeDyld/LoongArch/ELF_LoongArch_relocations.s
@@ -2,6 +2,9 @@
# RUN: llvm-mc --triple=loongarch64 --filetype=obj -o %t/reloc.o %s
# RUN: llvm-rtdyld --triple=loongarch64 --verify --check=%s %t/reloc.o \
# RUN: --map-section reloc.o,.got=0x21f00 \
+# RUN: --map-section reloc.o,.sec.large.pc=0x0000000012345000 \
+# RUN: --map-section reloc.o,.sec.large.got=0x44433333abcde000 \
+# RUN: --map-section reloc.o,.sec.dummy=0x4443333334567111 \
# RUN: --dummy-extern abs=0x0123456789abcdef \
# RUN: --dummy-extern external_data=0x1234

@@ -100,3 +103,42 @@ named_data:
.quad 0x2222222222222222
.quad 0x3333333333333333
.size named_data, .-named_data
+
+ .section .sec.large.pc,"ax"
+ .globl test_large_pc
+test_large_pc:
+## Code after link should be:
+## 1a44444d pcalau12i $t1, 139810
+## 02c4440c addi.d $t0, $zero, 273
+## 1666666c lu32i.d $t0, 209715
+## 0311118c lu52i.d $t0, $t0, 1092
+
+# rtdyld-check: *{4}(test_large_pc) = 0x1a44444d
+ pcalau12i $t1, %pc_hi20(.sec.dummy)
+# rtdyld-check: *{4}(test_large_pc + 4) = 0x02c4440c
+ addi.d $t0, $zero, %pc_lo12(.sec.dummy)
+# rtdyld-check: *{4}(test_large_pc + 8) = 0x1666666c
+ lu32i.d $t0, %pc64_lo20(.sec.dummy)
+# rtdyld-check: *{4}(test_large_pc + 12) = 0x0311118c
+ lu52i.d $t0, $t0, %pc64_hi12(.sec.dummy)
+
+ .section .sec.large.got,"ax"
+ .globl test_large_got
+test_large_got:
+## Code after link should be:
+## 1aa8688d pcalau12i $t1, 344900
+## 02fc000c addi.d $t0, $zero, -256
+## 1799996c lu32i.d $t0, -209717
+## 032eed8c lu52i.d $t0, $t0, -1093
+
+# rtdyld-check: *{4}(test_large_got) = 0x1aa8688d
+ pcalau12i $t1, %got_pc_hi20(external_data)
+# rtdyld-check: *{4}(test_large_got + 4) = 0x02fc000c
+ addi.d $t0, $zero, %got_pc_lo12(external_data)
+# rtdyld-check: *{4}(test_large_got + 8) = 0x1799996c
+ lu32i.d $t0, %got64_pc_lo20(external_data)
+# rtdyld-check: *{4}(test_large_got + 12) = 0x032eed8c
+ lu52i.d $t0, $t0, %got64_pc_hi12(external_data)
+
+ .section .sec.dummy,"a"
+ .word 0
diff --git a/llvm/test/FileCheck/long-check.txt b/llvm/test/FileCheck/long-check.txt
new file mode 100644
index 0000000..33bebfa
--- /dev/null
+++ b/llvm/test/FileCheck/long-check.txt
@@ -0,0 +1,9 @@
+// RUN: %ProtectFileCheckOutput not FileCheck --color=0 -input-file %s %s 2>&1 \
+// RUN: | FileCheck --check-prefix=ERROR --implicit-check-not={{error}}: %s
+
+ aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa
+CHECK: aaaaaaaaa{{a}} aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa 
aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaah!
+
+ERROR: {{error}}: CHECK: expected string not found in input
+ERROR: {{error}}: no match found
+ERROR-NOT: {{note}}: possible intended match here
\ No newline at end of file
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll
index e9c1075..ae8b2b3 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll
@@ -23,7 +23,7 @@ declare i32 @dummyPersonality(...)
define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr @__CxxFrameHandler3 {
; CHECK-INLINE-LABEL: define void @FuncletPersonality(
-; CHECK-INLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__CxxFrameHandler3 {
+; CHECK-INLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @__CxxFrameHandler3 {
; CHECK-INLINE-NEXT: entry:
; CHECK-INLINE-NEXT: [[TMP0:%.*]] = alloca i64, align 32
; CHECK-INLINE-NEXT: store i64 0, ptr [[TMP0]], align 8
@@ -87,7 +87,6 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f3(i64 [[TMP38]], i64 1)
; CHECK-INLINE-NEXT: [[TMP39:%.*]] = add i64 [[TMP29]], 1066
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP39]], i64 1)
-; CHECK-INLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP22]])
; CHECK-INLINE-NEXT: [[TMP40:%.*]] = lshr i64 [[TMP21]], 3
; CHECK-INLINE-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], [[TMP1]]
; CHECK-INLINE-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
@@ -100,13 +99,12 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP48:%.*]] = icmp sge i8 [[TMP47]], [[TMP43]]
; CHECK-INLINE-NEXT: br i1 [[TMP48]], label [[TMP49:%.*]], label [[TMP50]]
; CHECK-INLINE: 49:
-; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP21]]) #[[ATTR8:[0-9]+]]
+; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP21]]) #[[ATTR7:[0-9]+]]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: 50:
; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP22]], align 1
; CHECK-INLINE-NEXT: [[TMP51:%.*]] = add i64 [[TMP29]], 1066
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP51]], i64 1)
-; CHECK-INLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP22]])
; CHECK-INLINE-NEXT: [[TMP52:%.*]] = alloca i8, i64 96, align 32
; CHECK-INLINE-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64
; CHECK-INLINE-NEXT: [[TMP54:%.*]] = add i64 [[TMP53]], 32
@@ -128,7 +126,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP66:%.*]] = icmp ne i8 [[TMP65]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP66]], label [[TMP67:%.*]], label [[TMP68:%.*]]
; CHECK-INLINE: 67:
-; CHECK-INLINE-NEXT: call void @__asan_report_store8(i64 [[TMP59]]) #[[ATTR8]]
+; CHECK-INLINE-NEXT: call void @__asan_report_store8(i64 [[TMP59]]) #[[ATTR7]]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: 68:
; CHECK-INLINE-NEXT: store volatile i64 0, ptr [[TMP61]], align 8
@@ -158,7 +156,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP88:%.*]] = icmp sge i8 [[TMP87]], [[TMP83]]
; CHECK-INLINE-NEXT: br i1 [[TMP88]], label [[TMP89:%.*]], label [[TMP90]]
; CHECK-INLINE: 89:
-; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP77]]) #[[ATTR8]]
+; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP77]]) #[[ATTR7]]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: 90:
; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP79]], align 1
@@ -185,7 +183,6 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE: ehcleanup:
; CHECK-INLINE-NEXT: [[TMP98:%.*]] = cleanuppad within none []
; CHECK-INLINE-NEXT: call void @__asan_unpoison_stack_memory(i64 [[TMP54]], i64 4) [ "funclet"(token [[TMP98]]) ]
-; CHECK-INLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP56]])
; CHECK-INLINE-NEXT: [[TMP99:%.*]] = lshr i64 [[TMP54]], 3
; CHECK-INLINE-NEXT: [[TMP100:%.*]] = add i64 [[TMP99]], [[TMP1]]
; CHECK-INLINE-NEXT: [[TMP101:%.*]] = inttoptr i64 [[TMP100]] to ptr
@@ -198,12 +195,11 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP107:%.*]] = icmp sge i8 [[TMP106]], [[TMP102]]
; CHECK-INLINE-NEXT: br i1 [[TMP107]], label [[TMP108:%.*]], label [[TMP109]]
; CHECK-INLINE: 108:
-; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP54]]) #[[ATTR8]] [ "funclet"(token [[TMP98]]) ]
+; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP54]]) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: 109:
; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP56]], align 1
; CHECK-INLINE-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP54]], i64 4) [ "funclet"(token [[TMP98]]) ]
-; CHECK-INLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP56]])
; CHECK-INLINE-NEXT: call void @DeInit(ptr [[TMP14]]) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP110:%.*]] = call ptr @__asan_memset(ptr [[TMP16]], i32 0, i64 4) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP111:%.*]] = call ptr @__asan_memcpy(ptr [[TMP18]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP98]]) ]
@@ -226,7 +222,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP125:%.*]] = icmp sge i8 [[TMP124]], [[TMP120]]
; CHECK-INLINE-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127]]
; CHECK-INLINE: 126:
-; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP116]], i64 8) #[[ATTR8]] [ "funclet"(token [[TMP98]]) ]
+; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP116]], i64 8) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: 127:
; CHECK-INLINE-NEXT: [[TMP128:%.*]] = lshr i64 [[TMP114]], 3
@@ -241,7 +237,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP136:%.*]] = icmp sge i8 [[TMP135]], [[TMP131]]
; CHECK-INLINE-NEXT: br i1 [[TMP136]], label [[TMP137:%.*]], label [[EHEXIT]]
; CHECK-INLINE: 137:
-; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP114]], i64 8) #[[ATTR8]] [ "funclet"(token [[TMP98]]) ]
+; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP114]], i64 8) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: ehexit:
; CHECK-INLINE-NEXT: store i64 0, ptr [[PTRPARAM]], align 1
@@ -265,7 +261,7 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: cleanupret from [[TMP98]] unwind to caller
;
; CHECK-OUTLINE-LABEL: define void @FuncletPersonality(
-; CHECK-OUTLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__CxxFrameHandler3 {
+; CHECK-OUTLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @__CxxFrameHandler3 {
; CHECK-OUTLINE-NEXT: entry:
; CHECK-OUTLINE-NEXT: [[TMP0:%.*]] = alloca i64, align 32
; CHECK-OUTLINE-NEXT: store i64 0, ptr [[TMP0]], align 8
@@ -339,12 +335,10 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f3(i64 [[TMP45]], i64 5)
; CHECK-OUTLINE-NEXT: [[TMP46:%.*]] = add i64 [[TMP33]], 1066
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP46]], i64 1)
-; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP22]])
; CHECK-OUTLINE-NEXT: call void @__asan_store1(i64 [[TMP21]])
; CHECK-OUTLINE-NEXT: store volatile i8 0, ptr [[TMP22]], align 1
; CHECK-OUTLINE-NEXT: [[TMP47:%.*]] = add i64 [[TMP33]], 1066
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP47]], i64 1)
-; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP22]])
; CHECK-OUTLINE-NEXT: call void @__asan_store8(i64 [[TMP25]])
; CHECK-OUTLINE-NEXT: store volatile i64 0, ptr [[TMP26]], align 8
; CHECK-OUTLINE-NEXT: [[TMPCOPYI64:%.*]] = load i64, ptr [[TMP26]], align 8
@@ -389,12 +383,10 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: [[TMP67:%.*]] = cleanuppad within none []
; CHECK-OUTLINE-NEXT: [[TMP68:%.*]] = add i64 [[TMP33]], 1068
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP68]], i64 1) [ "funclet"(token [[TMP67]]) ]
-; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP24]])
; CHECK-OUTLINE-NEXT: call void @__asan_store1(i64 [[TMP23]]) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: store volatile i8 0, ptr [[TMP24]], align 1
; CHECK-OUTLINE-NEXT: [[TMP69:%.*]] = add i64 [[TMP33]], 1068
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP69]], i64 1) [ "funclet"(token [[TMP67]]) ]
-; CHECK-OUTLINE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP24]])
; CHECK-OUTLINE-NEXT: call void @DeInit(ptr [[TMP14]]) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP70:%.*]] = call ptr @__asan_memset(ptr [[TMP16]], i32 0, i64 4) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP71:%.*]] = call ptr @__asan_memcpy(ptr [[TMP18]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP67]]) ]
@@ -495,7 +487,7 @@ nopredecessor:
; Non-Windows personality, ensure no funclet gets attached to asan runtime call.
define void @OtherPersonality(ptr %ptrParam) sanitize_address personality ptr @dummyPersonality {
; CHECK-LABEL: define void @OtherPersonality(
-; CHECK-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @dummyPersonality {
+; CHECK-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @dummyPersonality {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8
; CHECK-NEXT: [[ASAN_LOCAL_STACK_BASE:%.*]] = alloca i64, align 8
diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll
index eac414a9..ddfa5e1 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime-throw.ll
@@ -24,7 +24,7 @@ entry:
call void @llvm.lifetime.start.p0(i64 4, ptr %x)
; CHECK: store i8 4, ptr %{{[0-9]+}}
- ; CHECK-NEXT: @llvm.lifetime.start
+ ; CHECK-NOT: @llvm.lifetime.start
%exception = call ptr @__cxa_allocate_exception(i64 4)
invoke void @__cxa_throw(ptr %exception, ptr @_ZTI3ABC, ptr @_ZN3ABCD2Ev) noreturn
@@ -38,7 +38,7 @@ lpad:
call void @_ZN3ABCD2Ev(ptr nonnull %x)
call void @llvm.lifetime.end.p0(i64 4, ptr %x)
; CHECK: store i8 -8, ptr %{{[0-9]+}}
- ; CHECK-NEXT: @llvm.lifetime.end
+ ; CHECK-NOT: @llvm.lifetime.end
resume { ptr, i32 } %0
; CHECK: store i64 0, ptr %{{[0-9]+}}
@@ -77,7 +77,7 @@ entry:
call void @llvm.lifetime.start.p0(i64 4, ptr %x)
; CHECK: store i8 4, ptr %{{[0-9]+}}
- ; CHECK-NEXT: @llvm.lifetime.start
+ ; CHECK-NOT: @llvm.lifetime.start
invoke void @_CxxThrowException(ptr %tmp, ptr nonnull @"_TI1?AUABC@@") noreturn
to label %unreachable unwind label %ehcleanup
@@ -89,7 +89,7 @@ ehcleanup:
call void @"\01??1ABC@@QEAA@XZ"(ptr nonnull %x) [ "funclet"(token %0) ]
call void @llvm.lifetime.end.p0(i64 4, ptr %x)
; CHECK: store i8 -8, ptr %{{[0-9]+}}
- ; CHECK-NEXT: @llvm.lifetime.end
+ ; CHECK-NOT: @llvm.lifetime.end
cleanupret from %0 unwind to caller
; CHECK: store i64 0, ptr %{{[0-9]+}}
diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
index a878dbe..bbfe00b 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -30,7 +30,6 @@ define void @lifetime_no_size(i64 %i) sanitize_address {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
; CHECK-NEXT: store i64 -868083117767659023, ptr [[TMP11]], align 1
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP2]])
; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP2]], i64 0, i64 [[I]]
; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[AI]] to i64
; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP12]], 3
@@ -49,7 +48,6 @@ define void @lifetime_no_size(i64 %i) sanitize_address {
; CHECK-NEXT: unreachable
; CHECK: [[BB23]]:
; CHECK-NEXT: store volatile i8 0, ptr [[AI]], align 4
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[TMP2]])
; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP9]], 0
; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
@@ -100,7 +98,6 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP15]], align 1
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP4]])
; CHECK-DEFAULT-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[TMP4]] to i64
; CHECK-DEFAULT-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP16]], 3
; CHECK-DEFAULT-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 2147450880
@@ -121,11 +118,9 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP29]], align 1
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]])
; CHECK-DEFAULT-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP31]], align 1
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP4]])
; CHECK-DEFAULT-NEXT: [[TMP32:%.*]] = alloca i8, i64 128, align 32
; CHECK-DEFAULT-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64
; CHECK-DEFAULT-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 32
@@ -135,7 +130,6 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP34]] to ptr
; CHECK-DEFAULT-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP36]] to i64
; CHECK-DEFAULT-NEXT: call void @__asan_unpoison_stack_memory(i64 [[TMP37]], i64 40)
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[TMP36]])
; CHECK-DEFAULT-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64
; CHECK-DEFAULT-NEXT: [[TMP39:%.*]] = lshr i64 [[TMP38]], 3
; CHECK-DEFAULT-NEXT: [[TMP40:%.*]] = add i64 [[TMP39]], 2147450880
@@ -155,11 +149,9 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP36]], align 1
; CHECK-DEFAULT-NEXT: [[TMP50:%.*]] = ptrtoint ptr [[TMP36]] to i64
; CHECK-DEFAULT-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP50]], i64 40)
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[TMP36]])
; CHECK-DEFAULT-NEXT: [[TMP51:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr
; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP52]], align 1
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP4]])
; CHECK-DEFAULT-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP4]] to i64
; CHECK-DEFAULT-NEXT: [[TMP54:%.*]] = lshr i64 [[TMP53]], 3
; CHECK-DEFAULT-NEXT: [[TMP55:%.*]] = add i64 [[TMP54]], 2147450880
@@ -180,7 +172,6 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP66]], align 1
-; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]])
; CHECK-DEFAULT-NEXT: [[TMP67:%.*]] = ptrtoint ptr [[TMP1]] to i64
; CHECK-DEFAULT-NEXT: [[TMP68:%.*]] = load i64, ptr [[TMP1]], align 8
; CHECK-DEFAULT-NEXT: call void @__asan_allocas_unpoison(i64 [[TMP68]], i64 [[TMP67]])
@@ -212,7 +203,6 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP13:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP14]], align 1
-; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP3]])
; CHECK-NO-DYNAMIC-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP3]] to i64
; CHECK-NO-DYNAMIC-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP15]], 3
; CHECK-NO-DYNAMIC-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
@@ -233,11 +223,9 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP27:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP28]], align 1
-; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]])
; CHECK-NO-DYNAMIC-NEXT: [[TMP29:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP30]], align 1
-; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP3]])
; CHECK-NO-DYNAMIC-NEXT: [[ARR:%.*]] = alloca [10 x i32], align 16
; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[ARR]])
; CHECK-NO-DYNAMIC-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[ARR]] to i64
@@ -261,7 +249,6 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP43:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP44]], align 1
-; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP3]])
; CHECK-NO-DYNAMIC-NEXT: [[TMP45:%.*]] = ptrtoint ptr [[TMP3]] to i64
; CHECK-NO-DYNAMIC-NEXT: [[TMP46:%.*]] = lshr i64 [[TMP45]], 3
; CHECK-NO-DYNAMIC-NEXT: [[TMP47:%.*]] = add i64 [[TMP46]], 2147450880
@@ -282,7 +269,6 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP57:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP58]], align 1
-; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]])
; CHECK-NO-DYNAMIC-NEXT: store i64 1172321806, ptr [[TMP4]], align 8
; CHECK-NO-DYNAMIC-NEXT: [[TMP59:%.*]] = add i64 [[TMP10]], 0
; CHECK-NO-DYNAMIC-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
@@ -325,166 +311,6 @@ define void @lifetime() sanitize_address {
ret void
}
-; Check that arguments of lifetime may come from phi nodes.
-define void @phi_args(i1 %x) sanitize_address {
-; CHECK-LABEL: define void @phi_args(
-; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.2 to i64), ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store i64 ptrtoint (ptr @phi_args to i64), ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: store i64 -868082052615769615, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP9]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[TMP2]])
-; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 3
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 2147450880
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i8 [[TMP18]], 0
-; CHECK-NEXT: br i1 [[TMP19]], label %[[BB20:.*]], label %[[BB25:.*]], !prof [[PROF1]]
-; CHECK: [[BB20]]:
-; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP14]], 7
-; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
-; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP18]]
-; CHECK-NEXT: br i1 [[TMP23]], label %[[BB24:.*]], label %[[BB25]]
-; CHECK: [[BB24]]:
-; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP14]]) #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB25]]:
-; CHECK-NEXT: store volatile i8 0, ptr [[TMP2]], align 1
-; CHECK-NEXT: br i1 [[X]], label %[[BB0:.*]], label %[[BB1:.*]]
-; CHECK: [[BB0]]:
-; CHECK-NEXT: br label %[[BB1]]
-; CHECK: [[BB1]]:
-; CHECK-NEXT: [[I_PHI:%.*]] = phi ptr [ [[TMP2]], %[[BB25]] ], [ [[TMP2]], %[[BB0]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP9]], 4
-; CHECK-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr
-; CHECK-NEXT: store i8 -8, ptr [[TMP27]], align 1
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[I_PHI]])
-; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP9]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-; CHECK-NEXT: store i64 0, ptr [[TMP29]], align 1
-; CHECK-NEXT: ret void
-;
-
-entry:
- %i = alloca i64, align 4
-
- ; Poison memory in prologue: F1F1F1F1F8F3F3F3
-
- call void @llvm.lifetime.start.p0(i64 8, ptr %i)
-
- store volatile i8 0, ptr %i
-
- br i1 %x, label %bb0, label %bb1
-
-bb0:
- br label %bb1
-
-bb1:
- %i.phi = phi ptr [ %i, %entry ], [ %i, %bb0 ]
- call void @llvm.lifetime.end.p0(i64 8, ptr %i.phi)
-
- ret void
-}
-
-; Check that arguments of lifetime may come from getelementptr nodes.
-define void @getelementptr_args(i64 %i) sanitize_address{
-; CHECK-LABEL: define void @getelementptr_args(
-; CHECK-SAME: i64 [[I:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 1216, align 32
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1184
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; CHECK-NEXT: store i64 1102416563, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.3 to i64), ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], 16
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: store i64 ptrtoint (ptr @getelementptr_args to i64), ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP0]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: store i32 -235802127, ptr [[TMP13]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4
-; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP14]], i64 128)
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], 132
-; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
-; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP16]], align 1
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP11]], 140
-; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP18]], align 1
-; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 150
-; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-; CHECK-NEXT: store i16 -3085, ptr [[TMP20]], align 1
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP11]], 4
-; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP21]], i64 128)
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1024, ptr [[TMP2]])
-; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 [[I]]
-; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[AI]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 3
-; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 2147450880
-; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[TMP25]], align 1
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i8 [[TMP26]], 0
-; CHECK-NEXT: br i1 [[TMP27]], label %[[BB28:.*]], label %[[BB29:.*]]
-; CHECK: [[BB28]]:
-; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP22]]) #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB29]]:
-; CHECK-NEXT: store ptr [[TMP2]], ptr [[AI]], align 8
-; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4
-; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP30]], i64 128)
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr [[TMP2]])
-; CHECK-NEXT: store i64 1172321806, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP11]], 0
-; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP31]], i64 148)
-; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP11]], 150
-; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
-; CHECK-NEXT: store i16 0, ptr [[TMP33]], align 1
-; CHECK-NEXT: ret void
-;
-entry:
- %x = alloca [1024 x i8], align 16
- %a = alloca [2 x ptr], align 8
-
- ; F1F1F1F1
- ; 0xf2f2f2f2f2f2f2f2
- ; 0xf2f2f2f2f2f2f2f2
-
- call void @llvm.lifetime.start.p0(i64 1024, ptr %x)
-
- %ai = getelementptr inbounds [2 x ptr], ptr %a, i64 0, i64 %i
- store ptr %x, ptr %ai, align 8
-
- call void @llvm.lifetime.end.p0(i64 1024, ptr %x)
-
- ret void
-}
-
define void @zero_sized(i64 %a) #0 {
; CHECK-LABEL: define void @zero_sized(
; CHECK-SAME: i64 [[A:%.*]]) {
diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll
index 9e21664..b4fe74a 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll
@@ -100,8 +100,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i8 2, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 650, ptr %xx)
-
call void @Foo(ptr %xx)
; CHECK-NEXT: call void @Foo(ptr %xx)
@@ -109,8 +107,6 @@ entry:
; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4
; ENTRY-UAS-NEXT: call void @__asan_set_shadow_f8(i64 [[OFFSET]], i64 82)
- ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 650, ptr %xx)
-
call void @llvm.lifetime.start.p0(i64 13, ptr %yy)
; 0005
@@ -118,8 +114,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i16 5, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 13, ptr %yy)
-
call void @Foo(ptr %yy)
; CHECK-NEXT: call void @Foo(ptr %yy)
@@ -129,8 +123,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i16 -1800, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 13, ptr %yy)
-
call void @llvm.lifetime.start.p0(i64 40, ptr %zz)
; 00000000
@@ -142,8 +134,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i8 0, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr %zz)
-
call void @Foo(ptr %zz)
; CHECK-NEXT: call void @Foo(ptr %zz)
@@ -157,8 +147,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i8 -8, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr %zz)
-
; CHECK: {{^[0-9]+}}:
; CHECK-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 0
diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll
index 35833ed..fca92cb 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll
@@ -100,8 +100,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i8 2, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 650, ptr %xx)
-
call void @Foo(ptr %xx)
; CHECK-NEXT: call void @Foo(ptr %xx)
@@ -109,8 +107,6 @@ entry:
; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4
; ENTRY-UAS-NEXT: call void @__asan_set_shadow_f8(i64 [[OFFSET]], i64 82)
- ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 650, ptr %xx)
-
call void @llvm.lifetime.start.p0(i64 13, ptr %yy)
; 0005
@@ -118,8 +114,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i16 1280, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 13, ptr %yy)
-
call void @Foo(ptr %yy)
; CHECK-NEXT: call void @Foo(ptr %yy)
@@ -129,8 +123,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i16 -1800, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 13, ptr %yy)
-
call void @llvm.lifetime.start.p0(i64 40, ptr %zz)
; 00000000
@@ -142,8 +134,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i8 0, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr %zz)
-
call void @Foo(ptr %zz)
; CHECK-NEXT: call void @Foo(ptr %zz)
@@ -157,8 +147,6 @@ entry:
; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to ptr
; ENTRY-UAS-NEXT: store i8 -8, ptr [[PTR]], align 1
- ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr %zz)
-
; CHECK: {{^[0-9]+}}:
; CHECK-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 0
@@ -209,40 +197,6 @@ entry:
; CHECK: ret void
}
-declare void @foo(ptr)
-define void @PR41481(i1 %b) sanitize_address {
-; CHECK-LABEL: @PR41481
-entry:
- %p1 = alloca i32
- %p2 = alloca i32
- br label %bb1
-
- ; Since we cannot account for all lifetime intrinsics in this function, we
- ; might have missed a lifetime.start one and therefore shouldn't poison the
- ; allocas at function entry.
- ; ENTRY: store i64 -935356719533264399
- ; ENTRY-UAS: store i64 -935356719533264399
-
-bb1:
- %p = select i1 %b, ptr %p1, ptr %p2
- %q = select i1 %b, ptr %p1, ptr %p2
- call void @llvm.lifetime.start.p0(i64 4, ptr %q)
- call void @foo(ptr %p)
- br i1 %b, label %bb2, label %bb3
-
-bb2:
- call void @llvm.lifetime.end.p0(i64 4, ptr %p1)
- br label %end
-
-bb3:
- call void @llvm.lifetime.end.p0(i64 4, ptr %p2)
- br label %end
-
-end:
- ret void
-}
-
-
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/alloca.ll b/llvm/test/Instrumentation/MemorySanitizer/alloca.ll
index 25a44ec..40ade5f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/alloca.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/alloca.ll
@@ -176,78 +176,5 @@ entry:
; CHECK: call void @llvm.lifetime.end
; CHECK: ret void
-
-; If we can't trace one of the lifetime markers to a single alloca, fall back
-; to poisoning allocas at the beginning of the function.
-; Each alloca must be poisoned only once.
-define void @lifetime_no_alloca(i8 %v) sanitize_memory {
-entry:
- %x = alloca i32, align 4
- %y = alloca i32, align 4
- %z = alloca i32, align 4
- %tobool = icmp eq i8 %v, 0
- %xy = select i1 %tobool, ptr %x, ptr %y
- %cxcy = select i1 %tobool, ptr %x, ptr %y
- br label %another_bb
-
-another_bb:
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z)
- store i32 7, ptr %z
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z)
- store i32 7, ptr %z
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cxcy)
- store i32 8, ptr %xy
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cxcy)
- ret void
-}
-
-; CHECK-LABEL: define void @lifetime_no_alloca(
-; CHECK-LABEL: entry:
-; CHECK: %x = alloca i32
-; INLINE: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false)
-; CALL: call void @__msan_poison_stack(ptr {{.*}}, i64 4)
-; ORIGIN: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4,
-; ORIGIN-LEAN: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4,
-; KMSAN: call void @__msan_poison_alloca(ptr {{.*}}, i64 4,
-; CHECK: %y = alloca i32
-; INLINE: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false)
-; CALL: call void @__msan_poison_stack(ptr {{.*}}, i64 4)
-; ORIGIN: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4,
-; ORIGIN-LEAN: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4,
-; KMSAN: call void @__msan_poison_alloca(ptr {{.*}}, i64 4,
-; CHECK: %z = alloca i32
-; INLINE: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false)
-; CALL: call void @__msan_poison_stack(ptr {{.*}}, i64 4)
-; ORIGIN: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4,
-; ORIGIN-LEAN: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4,
-; KMSAN: call void @__msan_poison_alloca(ptr {{.*}}, i64 4,
-
-; There're two lifetime intrinsics for %z, but we must instrument it only once.
-; INLINE-NOT: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false)
-; CALL-NOT: call void @__msan_poison_stack(ptr {{.*}}, i64 4)
-; ORIGIN-NOT: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4,
-; ORIGIN-LEAN-NOT: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4,
-; KMSAN-NOT: call void @__msan_poison_alloca(ptr {{.*}}, i64 4,
-; CHECK-LABEL: another_bb:
-
-; CHECK: call void @llvm.lifetime.start
-; INLINE-NOT: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false)
-; CALL-NOT: call void @__msan_poison_stack(ptr {{.*}}, i64 4)
-; ORIGIN-NOT: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4,
-; ORIGIN-LEAN-NOT: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4,
-; KMSAN-NOT: call void @__msan_poison_alloca(ptr {{.*}}, i64 4,
-; CHECK: call void @llvm.lifetime.end
-; CHECK: call void @llvm.lifetime.start
-; INLINE-NOT: call void @llvm.memset.p0.i64(ptr align 4 {{.*}}, i8 -1, i64 4, i1 false)
-; CALL-NOT: call void @__msan_poison_stack(ptr {{.*}}, i64 4)
-; ORIGIN-NOT: call void @__msan_set_alloca_origin_with_descr(ptr {{.*}}, i64 4,
-; ORIGIN-LEAN-NOT: call void @__msan_set_alloca_origin_no_descr(ptr {{.*}}, i64 4,
-; KMSAN-NOT: call void @__msan_poison_alloca(ptr {{.*}}, i64 4,
-; CHECK: call void @llvm.lifetime.end
-
-
-
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
index 6bb0f4b..3d6af6b 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
@@ -3628,6 +3628,18 @@ v_alignbit_b32 v5, v1, v2, exec_lo
v_alignbit_b32 v5, v1, v2, exec_hi
// GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01]
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
v_alignbyte_b32 v5, v1, v2, v3
// GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04]
@@ -3715,6 +3727,18 @@ v_alignbyte_b32 v5, v1, v2, exec_lo
v_alignbyte_b32 v5, v1, v2, exec_hi
// GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01]
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
v_mullit_f32 v5, v1, v2, v3
// GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s b/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s
index 899c4c7..800f662 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s
@@ -12,3 +12,30 @@ s_buffer_load_i8 s5, s[4:7], s0 nv
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU
// GFX12-ERR-NEXT:{{^}}s_buffer_load_i8 s5, s[4:7], s0 nv
// GFX12-ERR-NEXT:{{^}} ^
+
+s_load_b32 s4, s[2:3], 0xa scale_offset
+// GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x0a,0x00,0x00,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 0xa scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+s_load_b32 s4, s[2:3], 0xa scale_offset nv
+// GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 0xa scale_offset nv
+// GFX12-ERR-NEXT:{{^}} ^
+// GFX12-ERR-NEXT: error: nv is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 0xa scale_offset nv
+// GFX12-ERR-NEXT:{{^}} ^
+
+s_load_b32 s4, s[2:3], s5 offset:32 scale_offset
+// GFX1250: s_load_b32 s4, s[2:3], s5 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0x0b]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], s5 offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+s_load_b32 s4, s[2:3], m0 offset:32 scale_offset
+// GFX1250: s_load_b32 s4, s[2:3], m0 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0xfb]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], m0 offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s
new file mode 100644
index 0000000..e57d4fc76
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s
@@ -0,0 +1,16 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s
+
+s_buffer_load_i8 s5, s[4:7], s0 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}s_buffer_load_i8 s5, s[4:7], s0 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+s_prefetch_data s[18:19], 100, s10, 7 nv
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}s_prefetch_data s[18:19], 100, s10, 7 nv
+// GFX1250-ERR-NEXT:{{^}} ^
+
+s_prefetch_data s[18:19], 100, s10, 7 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}s_prefetch_data s[18:19], 100, s10, 7 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s
new file mode 100644
index 0000000..731eb67
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s
+
+buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
index 488040e..b9eb2d2 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
@@ -61,6 +61,194 @@ scratch_load_b32 v5, v2, off nv
// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off nv
// GFX12-ERR-NEXT:{{^}} ^
+global_load_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX1250: global_load_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x00,0x05,0xee,0x05,0x00,0x01,0x00,0x01,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}global_load_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+global_store_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX1250: global_store_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x81,0x00,0x05,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}global_store_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+global_atomic_add_u32 v2, v5, s[2:3] scale_offset
+// GFX1250: global_atomic_add_u32 v2, v5, s[2:3] scale_offset ; encoding: [0x02,0x40,0x0d,0xee,0x00,0x00,0x81,0x02,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}global_atomic_add_u32 v2, v5, s[2:3] scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+scratch_load_b32 v5, v2, off scale_offset
+// GFX1250: scratch_load_b32 v5, v2, off scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+scratch_load_b32 v5, v2, off offset:32 scale_offset
+// GFX1250: scratch_load_b32 v5, v2, off offset:32 scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+scratch_load_b32 v5, v2, s1 offset:32 scale_offset
+// GFX1250: scratch_load_b32 v5, v2, s1 offset:32 scale_offset ; encoding: [0x01,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, s1 offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+scratch_store_b32 v2, v5, off scale_offset
+// GFX1250: scratch_store_b32 v2, v5, off scale_offset ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_store_b32 v2, v5, off scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+scratch_store_b32 v2, v5, s1 scale_offset
+// GFX1250: scratch_store_b32 v2, v5, s1 scale_offset ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_store_b32 v2, v5, s1 scale_offset
+// GFX12-ERR-NEXT:{{^}} ^
+
+flat_prefetch_b8 v[2:3]
+// GFX1250: flat_prefetch_b8 v[2:3] ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] offset:1024
+// GFX1250: flat_prefetch_b8 v[2:3] offset:1024 ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x04,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] offset:-1024
+// GFX1250: flat_prefetch_b8 v[2:3] offset:-1024 ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0xfc,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] offset:-1024 th:TH_LOAD_NT scope:SCOPE_SE
+// GFX1250: flat_prefetch_b8 v[2:3] offset:-1024 th:TH_LOAD_NT scope:SCOPE_SE ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x14,0x00,0x02,0x00,0xfc,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] th:TH_LOAD_HT scope:SCOPE_CU
+// GFX1250: flat_prefetch_b8 v[2:3] th:TH_LOAD_HT ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x20,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] offset:64 th:TH_LOAD_NT_RT scope:SCOPE_DEV
+// GFX1250: flat_prefetch_b8 v[2:3] offset:64 th:TH_LOAD_NT_RT scope:SCOPE_DEV ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x48,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] th:TH_LOAD_HT
+// GFX1250: flat_prefetch_b8 v[2:3] th:TH_LOAD_HT ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x20,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_prefetch_b8 v[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: flat_prefetch_b8 v[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_prefetch_b8 v[2:3], off offset:-1024 th:TH_LOAD_HT scope:SCOPE_SE
+// GFX1250: global_prefetch_b8 v[2:3], off offset:-1024 th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x7c,0x40,0x17,0xee,0x00,0x00,0x24,0x00,0x02,0x00,0xfc,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_prefetch_b8 v4, s[2:3] offset:-1024 th:TH_LOAD_NT scope:SCOPE_DEV
+// GFX1250: global_prefetch_b8 v4, s[2:3] offset:-1024 th:TH_LOAD_NT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x17,0xee,0x00,0x00,0x18,0x00,0x04,0x00,0xfc,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_prefetch_b8 v4, s[2:3] th:TH_LOAD_RT_NT scope:SCOPE_CU
+// GFX1250: global_prefetch_b8 v4, s[2:3] th:TH_LOAD_RT_NT ; encoding: [0x02,0x40,0x17,0xee,0x00,0x00,0x50,0x00,0x04,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_prefetch_b8 v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_prefetch_b8 v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x17,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_prefetch_b8 v[2:3], off offset:64 th:TH_LOAD_NT_RT scope:SCOPE_DEV
+// GFX1250: global_prefetch_b8 v[2:3], off offset:64 th:TH_LOAD_NT_RT scope:SCOPE_DEV ; encoding: [0x7c,0x40,0x17,0xee,0x00,0x00,0x48,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v[2:3], off
+// GFX1250: global_load_monitor_b32 v1, v[2:3], off ; encoding: [0x7c,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v[2:3], off offset:64
+// GFX1250: global_load_monitor_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v[2:3], off offset:-64 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250: global_load_monitor_b32 v1, v[2:3], off offset:-64 th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x7c,0x00,0x1c,0xee,0x01,0x00,0x68,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v2, s[0:1]
+// GFX1250: global_load_monitor_b32 v1, v2, s[0:1] ; encoding: [0x00,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v2, s[0:1] offset:64
+// GFX1250: global_load_monitor_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v2, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_monitor_b32 v1, v2, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x00,0x00,0x1c,0xee,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[0:1], v[2:3], off
+// GFX1250: global_load_monitor_b64 v[0:1], v[2:3], off ; encoding: [0x7c,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[0:1], v[2:3], off offset:64
+// GFX1250: global_load_monitor_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[0:1], v[2:3], off offset:-64 th:TH_LOAD_HT scope:SCOPE_SE
+// GFX1250: global_load_monitor_b64 v[0:1], v[2:3], off offset:-64 th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x7c,0x40,0x1c,0xee,0x00,0x00,0x24,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[0:1], v2, s[0:1]
+// GFX1250: encoding: [0x00,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[0:1], v2, s[0:1] offset:64
+// GFX1250: global_load_monitor_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[0:1], v2, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_monitor_b64 v[0:1], v2, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x00,0x40,0x1c,0xee,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b128 v[0:3], v[4:5], off
+// GFX1250: global_load_monitor_b128 v[0:3], v[4:5], off ; encoding: [0x7c,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b128 v[0:3], v[4:5], off offset:64
+// GFX1250: global_load_monitor_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b128 v[0:3], v[4:5], off offset:-64 th:TH_LOAD_NT
+// GFX1250: global_load_monitor_b128 v[0:3], v[4:5], off offset:-64 th:TH_LOAD_NT ; encoding: [0x7c,0x80,0x1c,0xee,0x00,0x00,0x10,0x00,0x04,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b128 v[0:3], v4, s[0:1]
+// GFX1250: global_load_monitor_b128 v[0:3], v4, s[0:1] ; encoding: [0x00,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b128 v[0:3], v4, s[0:1] offset:64
+// GFX1250: global_load_monitor_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b128 v[0:3], v4, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_monitor_b128 v[0:3], v4, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x00,0x80,0x1c,0xee,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset
+// GFX1250: global_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x00,0x1c,0xee,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset
+// GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b32 v1, v[2:3]
+// GFX1250: flat_load_monitor_b32 v1, v[2:3] ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b32 v1, v[2:3] offset:64
+// GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:64 ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
tensor_save s[0:1]
// GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
@@ -81,10 +269,18 @@ tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS
// GFX1250: tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x1b,0xee,0x00,0x00,0x3c,0x00,0x00,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+flat_atomic_add_f32 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_add_f32 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_add_f32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_add_u32 v1, v2, s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_add_u32 v1, v2, s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_add_u32 v2, v3, s[2:3] offset:-64
// GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -93,6 +289,14 @@ flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_add_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_add_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_and_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_and_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_and_b32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -101,18 +305,38 @@ flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_and_b64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_and_b64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_cmpswap_b32 v0, v2, v[2:3], s[2:3] scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_cmpswap_b32 v0, v2, v[2:3], s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x11,0x01,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_cmpswap_b64 v[0:1], v2, v[2:5], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_cmpswap_b64 v[0:1], v2, v[2:5], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3]
// GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_cond_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_cond_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_dec_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_dec_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_dec_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -121,6 +345,14 @@ flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_dec_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_dec_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_inc_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_inc_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_inc_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -129,10 +361,22 @@ flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_inc_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_inc_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_max_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_max_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_max_i32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -141,6 +385,14 @@ flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_max_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_max_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_max_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -149,10 +401,22 @@ flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_max_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_min_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_min_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_min_i32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -161,6 +425,14 @@ flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_min_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_min_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_min_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -169,6 +441,14 @@ flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_min_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_or_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_or_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_or_b32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -177,10 +457,22 @@ flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_or_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_or_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_sub_clamp_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_sub_clamp_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_sub_u32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -189,6 +481,14 @@ flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_sub_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_sub_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_swap_b32 v0, v2, s[2:3] scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_swap_b32 v0, v2, s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x11,0x01,0x00,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_swap_b32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -197,6 +497,14 @@ flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_swap_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_swap_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_xor_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_xor_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_xor_b32 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -205,10 +513,118 @@ flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64
// GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_xor_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_xor_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_pk_add_f16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_pk_add_f16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+flat_atomic_pk_add_bf16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_pk_add_bf16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64
// GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+flat_load_b128 v[2:5], v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b128 v[2:5], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_b64 v[2:3], v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b64 v[2:3], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_b96 v[2:4], v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b96 v[2:4], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_b16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_hi_b16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_hi_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_hi_i8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_hi_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_hi_u8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_hi_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_i8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_u8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_i16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_i16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_i8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_u16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_u16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_u8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_dword v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b128 v2, v[2:5], s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b128 v2, v[2:5], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b16 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b32 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b32 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b64 v2, v[2:3], s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b64 v2, v[2:3], s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b8 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b96 v2, v[2:4], s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b96 v2, v[2:4], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_d16_hi_b16 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_d16_hi_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_d16_hi_b8 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_d16_hi_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_prefetch_b8 v3, s[2:3]
+// GFX1250: flat_prefetch_b8 v3, s[2:3] ; encoding: [0x02,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
new file mode 100644
index 0000000..26d7ed3
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
@@ -0,0 +1,59 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s
+
+global_load_b96 v[1:3], v[0:1], off
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+flat_load_b32 v5, v[2:3] scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_load_b32 v5, v[2:3] scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+flat_load_b32 v5, v[2:3] offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_load_b32 v5, v[2:3] offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+flat_store_b32 v[2:3], v5 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_store_b32 v[2:3], v5 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+flat_atomic_add v[2:3], v2 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_atomic_add v[2:3], v2 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+global_load_b32 v5, v[2:3], off offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_load_b32 v5, v[2:3], off offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+global_store_b32 v[2:3], v5, off offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_store_b32 v[2:3], v5, off offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+global_atomic_add v[2:3], v2, off scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_atomic_add v[2:3], v2, off scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+global_load_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_load_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+global_store_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_store_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+scratch_load_b32 v5, off, s1 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}scratch_load_b32 v5, off, s1 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
+
+scratch_load_b32 v5, off, off offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}scratch_load_b32 v5, off, off offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
index 20bc578..0a1d3bf 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
@@ -154,6 +154,362 @@ v_fmac_f64 v[4:5], v[2:3], v[8:9] div:2
// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_add_nc_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], exec
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], 0
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], -1
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_add_nc_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], exec
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], 0
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], -1
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sub_nc_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[254:255], v[2:3], v[4:5]
+// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64_e64 v[4:5], s[2:3], s[4:5]
+// GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[254:255], v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], vcc, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], exec, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -1, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0.5, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -4.0, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0xaf123456, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0x3f717273, v[4:5]
+// GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[254:255], v[2:3], v[8:9]
+// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[254:255], v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], vcc, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], exec, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -1, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], 0.5, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], -4.0, v[8:9]
+// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], v[254:255]
+// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], vcc
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], exec
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], 0
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], -1
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], 0.5
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_mul_u64 v[4:5], v[2:3], -4.0
+// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00]
+// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3]
// GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
index b68306d..9f50361 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s
@@ -1,5 +1,8 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s
+v_add_f64 v[1:2], v[1:2], v[1:2]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+
v_fmaak_f32 v4, v2, v6, 3 row_share:1
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1250-ERR-NEXT:{{^}}v_fmaak_f32 v4, v2, v6, 3 row_share:1
@@ -19,3 +22,8 @@ v_fmamk_f16 v4, v2, 3, v6 row_share:1
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// GFX1250-ERR-NEXT:{{^}}v_fmamk_f16 v4, v2, 3, v6 row_share:1
// GFX1250-ERR-NEXT:{{^}} ^
+
+v_mul_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_mul_u64 v[4:5], v[2:3], v[8:9] clamp
+// GFX1250-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
index 0070c8a..789d6f8 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
@@ -15,3 +15,48 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
v_lshl_add_u64 v[2:3], v[4:5], v7, 12345
// GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, v1, v2
+// GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]
+
+v_cvt_pk_bf16_f32 v5, v255, v255
+// GFX1250: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0xff,0x03,0x00]
+
+v_cvt_pk_bf16_f32 v5, s1, s2
+// GFX1250: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x04,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, s105, s105
+// GFX1250: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15
+// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456
+// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+
+v_cvt_pk_bf16_f32 v5, ttmp15, src_scc
+// GFX1250: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0x01,0x00]
+
+v_cvt_pk_bf16_f32 v5, m0, 0.5
+// GFX1250: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0x01,0x00]
+
+v_cvt_pk_bf16_f32 v5, exec_lo, -1
+// GFX1250: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0x01,0x00]
+
+v_cvt_pk_bf16_f32 v5, exec_hi, null
+// GFX1250: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, null, exec_lo
+// GFX1250: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, -1, exec_hi
+// GFX1250: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2
+// GFX1250: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0x00,0x08]
+
+v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4
+// GFX1250: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x00,0x10]
+
+v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
+// GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
index 553eacc..e1165fa 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
@@ -15,3 +15,48 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
v_lshl_add_u64 v[2:3], v[4:5], v7, 12345
// GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, v1, v2
+// GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]
+
+v_cvt_pk_bf16_f32 v5, v255, v255
+// GFX1250: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0xff,0x03,0x00]
+
+v_cvt_pk_bf16_f32 v5, s1, s2
+// GFX1250: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x04,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, s105, s105
+// GFX1250: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15
+// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456
+// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+
+v_cvt_pk_bf16_f32 v5, ttmp15, src_scc
+// GFX1250: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0x01,0x00]
+
+v_cvt_pk_bf16_f32 v5, m0, 0.5
+// GFX1250: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0x01,0x00]
+
+v_cvt_pk_bf16_f32 v5, exec_lo, -1
+// GFX1250: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0x01,0x00]
+
+v_cvt_pk_bf16_f32 v5, exec_hi, null
+// GFX1250: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, null, exec_lo
+// GFX1250: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, -1, exec_hi
+// GFX1250: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2
+// GFX1250: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0x00,0x08]
+
+v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4
+// GFX1250: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x00,0x10]
+
+v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
+// GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s
new file mode 100644
index 0000000..bc910b9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s
new file mode 100644
index 0000000..3bb84e2
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s
new file mode 100644
index 0000000..f48445f
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s
@@ -0,0 +1,19 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s
new file mode 100644
index 0000000..d7a95f4
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s
@@ -0,0 +1,19 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s
new file mode 100644
index 0000000..a17fa67
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s
@@ -0,0 +1,1483 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,0,0]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x1f,0xcc,0x00,0x05,0x12,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x1f,0xcc,0x00,0x05,0x12,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,1,1]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x1f,0xcc,0x00,0x05,0x12,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x1f,0xcc,0x00,0x05,0x12,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x3c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x5c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x1f,0xcc,0x00,0x05,0x12,0x9c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x1f,0xcc,0x00,0x05,0x12,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x1f,0xcc,0x00,0x05,0x12,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x1f,0xcc,0x00,0x05,0x12,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] clamp
+// GFX1250: v_pk_fma_f32 v[8:9], v[0:1], v[2:3], v[4:5] clamp ; encoding: [0x08,0xc0,0x1f,0xcc,0x00,0x05,0x12,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17]
+// GFX1250: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0x1f,0xcc,0x04,0x11,0x42,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0
+// GFX1250: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 ; encoding: [0x00,0x40,0x1f,0xcc,0x02,0x09,0xca,0x1b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[254:255], v[8:9], v[16:17]
+// GFX1250: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[254:255], v[16:17]
+// GFX1250: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0xfe,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[2:3], v[16:17]
+// GFX1250: v_pk_mul_f32 v[4:5], v[2:3], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x02,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[100:101], v[16:17]
+// GFX1250: v_pk_mul_f32 v[4:5], v[100:101], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x64,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[254:255]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xfd,0x03,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[2:3]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[2:3] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x05,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[100:101]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[100:101] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xc9,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x08]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x10]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x38]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x58]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x78]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1]
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp
+// GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x28,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_f32 v[0:1], v[2:3], 1.0
+// GFX1250: v_pk_mul_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x28,0xcc,0x02,0xe5,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[254:255], v[8:9], v[16:17]
+// GFX1250: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[254:255], v[16:17]
+// GFX1250: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0xfe,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[2:3], v[16:17]
+// GFX1250: v_pk_add_f32 v[4:5], v[2:3], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x02,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[100:101], v[16:17]
+// GFX1250: v_pk_add_f32 v[4:5], v[100:101], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x64,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[254:255]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xfd,0x03,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[2:3]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[2:3] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x05,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[100:101]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[100:101] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xc9,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x08]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x10]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x38]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x58]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x78]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1]
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp
+// GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x29,0xcc,0x08,0x21,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_f32 v[0:1], v[2:3], 1.0
+// GFX1250: v_pk_add_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x29,0xcc,0x02,0xe5,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, s1, v2, v3
+// GFX1250: v_pk_add_min_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_add_min_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x2d,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, 100, v2, v3
+// GFX1250: v_pk_add_min_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, 100, 100, v3
+// GFX1250: v_pk_add_min_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, 100, 100, 100
+// GFX1250: v_pk_add_min_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, 100, 100
+// GFX1250: v_pk_add_min_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, 100
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x2d,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x2d,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_i16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_add_min_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x2d,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, s1, v2, v3
+// GFX1250: v_pk_add_max_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_add_max_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x14,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, 100, v2, v3
+// GFX1250: v_pk_add_max_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, 100, 100, v3
+// GFX1250: v_pk_add_max_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, 100, 100, 100
+// GFX1250: v_pk_add_max_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x14,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, 100, 100
+// GFX1250: v_pk_add_max_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, 100
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x14,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x14,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_i16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_add_max_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x14,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, s1, v2, v3
+// GFX1250: v_pk_add_min_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_add_min_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x2e,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, 100, v2, v3
+// GFX1250: v_pk_add_min_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, 100, 100, v3
+// GFX1250: v_pk_add_min_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, 100, 100, 100
+// GFX1250: v_pk_add_min_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2e,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, 100, 100
+// GFX1250: v_pk_add_min_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, 100
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x2e,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x2e,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_min_u16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_add_min_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x2e,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, s1, v2, v3
+// GFX1250: v_pk_add_max_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_add_max_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x15,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, 100, v2, v3
+// GFX1250: v_pk_add_max_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, 100, 100, v3
+// GFX1250: v_pk_add_max_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, 100, 100, 100
+// GFX1250: v_pk_add_max_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x15,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, 100, 100
+// GFX1250: v_pk_add_max_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, 100
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x15,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x15,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_max_u16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_add_max_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x15,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, s1, v2, v3
+// GFX1250: v_pk_min3_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_min3_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x31,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, 100, v2, v3
+// GFX1250: v_pk_min3_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, 100, 100, v3
+// GFX1250: v_pk_min3_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, 100, 100, 100
+// GFX1250: v_pk_min3_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x31,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, 100, 100
+// GFX1250: v_pk_min3_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, 100
+// GFX1250: v_pk_min3_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x31,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x31,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_i16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_min3_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x31,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, s1, v2, v3
+// GFX1250: v_pk_max3_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_max3_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x2f,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, 100, v2, v3
+// GFX1250: v_pk_max3_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, 100, 100, v3
+// GFX1250: v_pk_max3_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, 100, 100, 100
+// GFX1250: v_pk_max3_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2f,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, 100, 100
+// GFX1250: v_pk_max3_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, 100
+// GFX1250: v_pk_max3_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x2f,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x2f,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_i16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_max3_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x2f,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, s1, v2, v3
+// GFX1250: v_pk_min3_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_min3_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x32,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, 100, v2, v3
+// GFX1250: v_pk_min3_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, 100, 100, v3
+// GFX1250: v_pk_min3_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, 100, 100, 100
+// GFX1250: v_pk_min3_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x32,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, 100, 100
+// GFX1250: v_pk_min3_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, 100
+// GFX1250: v_pk_min3_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x32,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x32,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_u16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_min3_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x32,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, s1, v2, v3
+// GFX1250: v_pk_max3_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, s1, v2, v3 clamp
+// GFX1250: v_pk_max3_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x30,0xcc,0x01,0x04,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, 100, v2, v3
+// GFX1250: v_pk_max3_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, 100, 100, v3
+// GFX1250: v_pk_max3_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, 100, 100, 100
+// GFX1250: v_pk_max3_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x30,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, 100, 100
+// GFX1250: v_pk_max3_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, 100
+// GFX1250: v_pk_max3_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3 op_sel:[0,1,0]
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3 op_sel:[0,1,1]
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3 op_sel:[1,0,0]
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0]
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x30,0xcc,0x01,0x05,0x0e,0x0c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+// GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x30,0xcc,0x01,0x05,0x0e,0x14]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_u16 v10, s1, 100, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp
+// GFX1250: v_pk_max3_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x30,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, v1, v2
+// GFX1250: v_pk_add_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x23,0xcc,0x01,0x05,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, v255, v255
+// GFX1250: v_pk_add_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x23,0xcc,0xff,0xff,0x03,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, s1, s2
+// GFX1250: v_pk_add_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x23,0xcc,0x01,0x04,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, s105, s105
+// GFX1250: v_pk_add_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x23,0xcc,0x69,0xd2,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, vcc_lo, ttmp15
+// GFX1250: v_pk_add_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x23,0xcc,0x6a,0xf6,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, vcc_hi, 0xfe0b
+// GFX1250: v_pk_add_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x23,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, ttmp15, src_scc
+// GFX1250: v_pk_add_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x23,0xcc,0x7b,0xfa,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, m0, 0.5
+// GFX1250: v_pk_add_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x23,0xcc,0x7d,0xe0,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, exec_lo, -1
+// GFX1250: v_pk_add_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x23,0xcc,0x7e,0x82,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, exec_hi, null
+// GFX1250: v_pk_add_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x23,0xcc,0x7f,0xf8,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, null, exec_lo
+// GFX1250: v_pk_add_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x23,0xcc,0x7c,0xfc,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0]
+// GFX1250: v_pk_add_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x23,0xcc,0xc1,0xfe,0x00,0x20]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1]
+// GFX1250: v_pk_add_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x23,0xcc,0xf0,0xfa,0x00,0x58]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0]
+// GFX1250: v_pk_add_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x23,0xcc,0xfd,0xd4,0x00,0x10]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_add_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp
+// GFX1250: v_pk_add_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, v1, v2
+// GFX1250: v_pk_mul_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2a,0xcc,0x01,0x05,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, v255, v255
+// GFX1250: v_pk_mul_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2a,0xcc,0xff,0xff,0x03,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, s1, s2
+// GFX1250: v_pk_mul_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2a,0xcc,0x01,0x04,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, s105, s105
+// GFX1250: v_pk_mul_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2a,0xcc,0x69,0xd2,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, vcc_lo, ttmp15
+// GFX1250: v_pk_mul_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2a,0xcc,0x6a,0xf6,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, vcc_hi, 0xfe0b
+// GFX1250: v_pk_mul_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2a,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, ttmp15, src_scc
+// GFX1250: v_pk_mul_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2a,0xcc,0x7b,0xfa,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, m0, 0.5
+// GFX1250: v_pk_mul_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2a,0xcc,0x7d,0xe0,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, exec_lo, -1
+// GFX1250: v_pk_mul_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2a,0xcc,0x7e,0x82,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, exec_hi, null
+// GFX1250: v_pk_mul_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2a,0xcc,0x7f,0xf8,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, null, exec_lo
+// GFX1250: v_pk_mul_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2a,0xcc,0x7c,0xfc,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0]
+// GFX1250: v_pk_mul_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2a,0xcc,0xc1,0xfe,0x00,0x20]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1]
+// GFX1250: v_pk_mul_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2a,0xcc,0xf0,0xfa,0x00,0x58]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0]
+// GFX1250: v_pk_mul_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2a,0xcc,0xfd,0xd4,0x00,0x10]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_mul_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp
+// GFX1250: v_pk_mul_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2a,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, v1, v2
+// GFX1250: v_pk_max_num_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2c,0xcc,0x01,0x05,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, v255, v255
+// GFX1250: v_pk_max_num_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2c,0xcc,0xff,0xff,0x03,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, s1, s2
+// GFX1250: v_pk_max_num_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2c,0xcc,0x01,0x04,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, s105, s105
+// GFX1250: v_pk_max_num_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2c,0xcc,0x69,0xd2,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, vcc_lo, ttmp15
+// GFX1250: v_pk_max_num_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2c,0xcc,0x6a,0xf6,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, vcc_hi, 0xfe0b
+// GFX1250: v_pk_max_num_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2c,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, ttmp15, src_scc
+// GFX1250: v_pk_max_num_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2c,0xcc,0x7b,0xfa,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, m0, 0.5
+// GFX1250: v_pk_max_num_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2c,0xcc,0x7d,0xe0,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, exec_lo, -1
+// GFX1250: v_pk_max_num_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2c,0xcc,0x7e,0x82,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, exec_hi, null
+// GFX1250: v_pk_max_num_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2c,0xcc,0x7f,0xf8,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, null, exec_lo
+// GFX1250: v_pk_max_num_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2c,0xcc,0x7c,0xfc,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0]
+// GFX1250: v_pk_max_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2c,0xcc,0xc1,0xfe,0x00,0x20]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1]
+// GFX1250: v_pk_max_num_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2c,0xcc,0xf0,0xfa,0x00,0x58]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0]
+// GFX1250: v_pk_max_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2c,0xcc,0xfd,0xd4,0x00,0x10]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp
+// GFX1250: v_pk_max_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2c,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, v1, v2
+// GFX1250: v_pk_min_num_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2b,0xcc,0x01,0x05,0x02,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, v255, v255
+// GFX1250: v_pk_min_num_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2b,0xcc,0xff,0xff,0x03,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, s1, s2
+// GFX1250: v_pk_min_num_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2b,0xcc,0x01,0x04,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, s105, s105
+// GFX1250: v_pk_min_num_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2b,0xcc,0x69,0xd2,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, vcc_lo, ttmp15
+// GFX1250: v_pk_min_num_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2b,0xcc,0x6a,0xf6,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, vcc_hi, 0xfe0b
+// GFX1250: v_pk_min_num_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2b,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, ttmp15, src_scc
+// GFX1250: v_pk_min_num_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2b,0xcc,0x7b,0xfa,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, m0, 0.5
+// GFX1250: v_pk_min_num_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2b,0xcc,0x7d,0xe0,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, exec_lo, -1
+// GFX1250: v_pk_min_num_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2b,0xcc,0x7e,0x82,0x01,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, exec_hi, null
+// GFX1250: v_pk_min_num_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2b,0xcc,0x7f,0xf8,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, null, exec_lo
+// GFX1250: v_pk_min_num_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2b,0xcc,0x7c,0xfc,0x00,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0]
+// GFX1250: v_pk_min_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2b,0xcc,0xc1,0xfe,0x00,0x20]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1]
+// GFX1250: v_pk_min_num_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2b,0xcc,0xf0,0xfa,0x00,0x58]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0]
+// GFX1250: v_pk_min_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2b,0xcc,0xfd,0xd4,0x00,0x10]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp
+// GFX1250: v_pk_min_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2b,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, v1, v2, s3
+// GFX1250: v_pk_fma_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x11,0xcc,0x01,0x05,0x0e,0x18]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, v255, s2, s105
+// GFX1250: v_pk_fma_bf16 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x11,0xcc,0xff,0x05,0xa4,0x19]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, s1, v255, exec_hi
+// GFX1250: v_pk_fma_bf16 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x11,0xcc,0x01,0xfe,0xff,0x19]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, s105, s105, exec_lo
+// GFX1250: v_pk_fma_bf16 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x11,0xcc,0x69,0xd2,0xf8,0x19]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, vcc_lo, ttmp15, v3
+// GFX1250: v_pk_fma_bf16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x11,0xcc,0x6a,0xf6,0x0c,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, vcc_hi, 0xfe0b, v255
+// GFX1250: v_pk_fma_bf16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x40,0x11,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, ttmp15, src_scc, ttmp15
+// GFX1250: v_pk_fma_bf16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x11,0xcc,0x7b,0xfa,0xed,0x19]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0]
+// GFX1250: v_pk_fma_bf16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x11,0xcc,0x7d,0xe0,0xf5,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1]
+// GFX1250: v_pk_fma_bf16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x11,0xcc,0x7e,0x82,0xad,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0]
+// GFX1250: v_pk_fma_bf16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x11,0xcc,0x7f,0xf8,0xa8,0x11]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX1250: v_pk_fma_bf16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x05,0x39,0x11,0xcc,0x7c,0xfc,0xfc,0x2b,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX1250: v_pk_fma_bf16 v5, -1, exec_hi, src_scc neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x05,0x42,0x11,0xcc,0xc1,0xfe,0xf4,0x5b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX1250: v_pk_fma_bf16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x05,0x4c,0x11,0xcc,0xf0,0xfa,0xc0,0x93]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX1250: v_pk_fma_bf16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x11,0xcc,0xfd,0xd4,0x04,0x0b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_fma_bf16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp
+// GFX1250: v_pk_fma_bf16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp ; encoding: [0xff,0xa7,0x11,0xcc,0xff,0xd6,0xf0,0xf9,0x0b,0xfe,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x36,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x36,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[1,1,1]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x36,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x36,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,0,0]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x3c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[0,1,0]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x5c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[0,0,1]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x9c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[1,0,0]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x36,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[0,1,0]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x36,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[0,0,1]
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x36,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v8, v1, s1, v4 clamp
+// GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x36,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v1, v4, v9, v16
+// GFX1250: v_pk_minimum3_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x36,0xcc,0x04,0x13,0x42,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_minimum3_f16 v1, v2, v5, 1.0
+// GFX1250: v_pk_minimum3_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x36,0xcc,0x02,0x0b,0xca,0x1b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x37,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x37,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[1,1,1]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x37,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x37,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,0,0]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x3c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[0,1,0]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x5c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[0,0,1]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x9c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[1,0,0]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x37,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[0,1,0]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x37,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[0,0,1]
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x37,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v8, v1, s1, v4 clamp
+// GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x37,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v1, v4, v9, v16
+// GFX1250: v_pk_maximum3_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x37,0xcc,0x04,0x13,0x42,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_maximum3_f16 v1, v2, v5, 1.0
+// GFX1250: v_pk_maximum3_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x37,0xcc,0x02,0x0b,0xca,0x1b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x38,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x38,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x3c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x5c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x9c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x38,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x38,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1]
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x38,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v8, v1, s1, v4 clamp
+// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x38,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v1, v4, v9, v16
+// GFX1250: v_pk_min3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x38,0xcc,0x04,0x13,0x42,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_min3_num_f16 v1, v2, v5, 1.0
+// GFX1250: v_pk_min3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x38,0xcc,0x02,0x0b,0xca,0x1b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x39,0xcc,0x01,0x03,0x10,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0xfc]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x3c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x5c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x9c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x39,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x39,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1]
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x39,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v8, v1, s1, v4 clamp
+// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x39,0xcc,0x01,0x03,0x10,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v1, v4, v9, v16
+// GFX1250: v_pk_max3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x39,0xcc,0x04,0x13,0x42,0x1c]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_pk_max3_num_f16 v1, v2, v5, 1.0
+// GFX1250: v_pk_max3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, v1, v2, s3
+// GFX1250: v_fma_mix_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, v255, v255, s105
+// GFX1250: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, s1, s2, v3
+// GFX1250: v_fma_mix_f32_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, s105, s105, m0
+// GFX1250: v_fma_mix_f32_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15
+// GFX1250: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255
+// GFX1250: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi
+// GFX1250: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo|
+// GFX1250: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc|
+// GFX1250: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1]
+// GFX1250: v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1]
+// GFX1250: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0]
+// GFX1250: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0]
+// GFX1250: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp
+// GFX1250: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, v1, v2, s3
+// GFX1250: v_fma_mixlo_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, v255, v255, s105
+// GFX1250: v_fma_mixlo_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, s1, s2, v3
+// GFX1250: v_fma_mixlo_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, s105, s105, m0
+// GFX1250: v_fma_mixlo_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15
+// GFX1250: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255
+// GFX1250: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi
+// GFX1250: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo|
+// GFX1250: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc|
+// GFX1250: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1]
+// GFX1250: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1]
+// GFX1250: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0]
+// GFX1250: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0]
+// GFX1250: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp
+// GFX1250: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, v1, v2, s3
+// GFX1250: v_fma_mixhi_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, v255, v255, s105
+// GFX1250: v_fma_mixhi_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, s1, s2, v3
+// GFX1250: v_fma_mixhi_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, s105, s105, m0
+// GFX1250: v_fma_mixhi_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15
+// GFX1250: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255
+// GFX1250: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi
+// GFX1250: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo|
+// GFX1250: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc|
+// GFX1250: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1]
+// GFX1250: v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1]
+// GFX1250: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0]
+// GFX1250: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0]
+// GFX1250: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp
+// GFX1250: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_alias.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_alias.s
new file mode 100644
index 0000000..8d5c114
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p_alias.s
@@ -0,0 +1,5 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
+
+v_fma_mix_f32_f16 v5, v1, v2, s3
+// GFX1250: v_fma_mix_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x20,0xcc,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index e81b6a1..d8dfd1e 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -923,6 +923,71 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47]
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
+// GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
+// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+
v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19]
// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c]
// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
index 47445d3..421d96b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
@@ -363,6 +363,82 @@ v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:2
v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,0,1]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,1,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,1,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:-1
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:xxx
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid matrix_b_fmt value
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47]
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP8
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP8
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP8
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF8
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP6
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_BF6
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:27], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: wrong register tuple size for MATRIX_FMT_FP4
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[20:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
+// GFX1250-ERR-NEXT: {{^}} ^
+
v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_err.s b/llvm/test/MC/AMDGPU/gfx1250_err.s
index e04c6aa..e4598fe 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_err.s
@@ -136,3 +136,23 @@ v_fmaak_f64 v[4:5], 0x7e8, v[8:9], lit64(0x7e8)
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed
// GFX1250-ERR: v_fmaak_f64 v[4:5], 0x7e8, v[8:9], lit64(0x7e8)
// GFX1250-ERR: ^
+
+v_pk_add_min_i16 v10, -v1, v2, v3
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR: v_pk_add_min_i16 v10, -v1, v2, v3
+// GFX1250-ERR: ^
+
+v_pk_add_min_i16 v10, sext(v1), v2, v3
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR: v_pk_add_min_i16 v10, sext(v1), v2, v3
+// GFX1250-ERR: ^
+
+v_pk_add_min_i16 v10, v1, v2, v3 neg_lo:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR: v_pk_add_min_i16 v10, v1, v2, v3 neg_lo:[1,0,0]
+// GFX1250-ERR: ^
+
+v_pk_add_min_i16 v10, v1, v2, v3 neg_hi:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR: v_pk_add_min_i16 v10, v1, v2, v3 neg_hi:[1,0,0]
+// GFX1250-ERR: ^
diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s
index 9dcbd4a..7b6b241 100644
--- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s
@@ -44,3 +44,16 @@ s_load_dword s5, s[2:3], glc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions
// CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc
// CHECK-NEXT:{{^}} ^
+
+//==============================================================================
+// not a valid operand
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}} ^
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s
index 1e8457d..a475c73 100644
--- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s
@@ -49,3 +49,13 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
// CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0
// CHECK-NEXT:{{^}} ^
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}} ^
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
index f3f4cae..a1cd9ce 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
@@ -2829,6 +2829,18 @@ v_alignbit_b32 v5, v1, v2, src_execz
v_alignbit_b32 v5, v1, v2, src_scc
// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03]
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
v_alignbyte_b32 v5, v1, v2, v3
// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04]
@@ -3000,6 +3012,18 @@ v_alignbyte_b32 v5, v1, v2, src_execz
v_alignbyte_b32 v5, v1, v2, src_scc
// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03]
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
v_min3_f32 v5, v1, v2, v3
// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AVR/inst-brbc.s b/llvm/test/MC/AVR/inst-brbc.s
index 6d96393..bf73188 100644
--- a/llvm/test/MC/AVR/inst-brbc.s
+++ b/llvm/test/MC/AVR/inst-brbc.s
@@ -15,8 +15,10 @@ foo:
; CHECK: brcc .Ltmp1-16+2 ; encoding: [0bAAAAA000,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 23 f4 brvc .+8
-; INST-NEXT: c0 f7 brsh .-16
+; INST-NEXT: fb f7 brvc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xa
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0xc
; INST-NEXT: 59 f7 brne .-42
; INST-NEXT: 52 f7 brpl .-44
; INST-NEXT: 4c f7 brge .-46
diff --git a/llvm/test/MC/AVR/inst-brbs.s b/llvm/test/MC/AVR/inst-brbs.s
index 9dde5e1..3e64ebc 100644
--- a/llvm/test/MC/AVR/inst-brbs.s
+++ b/llvm/test/MC/AVR/inst-brbs.s
@@ -14,8 +14,10 @@ foo:
; CHECK: brcs .Ltmp1-12+2 ; encoding: [0bAAAAA000,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 23 f0 brvs .+8
-; INST-NEXT: d0 f3 brlo .-12
+; INST-NEXT: fb f3 brvs .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xa
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x8
; INST-NEXT: 59 f3 breq .-42
; INST-NEXT: 52 f3 brmi .-44
; INST-NEXT: 4c f3 brlt .-46
diff --git a/llvm/test/MC/AVR/inst-brcc.s b/llvm/test/MC/AVR/inst-brcc.s
index 0edefa1..eba05e0 100644
--- a/llvm/test/MC/AVR/inst-brcc.s
+++ b/llvm/test/MC/AVR/inst-brcc.s
@@ -18,7 +18,11 @@ bar:
; CHECK: brcc bar ; encoding: [0bAAAAA000,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 08 f5 brsh .+66
-; INST-NEXT: a8 f7 brsh .-22
-; INST-NEXT: 08 f5 brsh .+66
-; INST-NEXT: 00 f4 brsh .+0
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x44
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x12
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x48
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x8
diff --git a/llvm/test/MC/AVR/inst-brcs.s b/llvm/test/MC/AVR/inst-brcs.s
index ea8a3f5..fb4e0dd 100644
--- a/llvm/test/MC/AVR/inst-brcs.s
+++ b/llvm/test/MC/AVR/inst-brcs.s
@@ -18,7 +18,11 @@ bar:
; CHECK: brcs bar ; encoding: [0bAAAAA000,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 20 f0 brlo .+8
-; INST-NEXT: 10 f0 brlo .+4
-; INST-NEXT: 20 f0 brlo .+8
-; INST-NEXT: 00 f0 brlo .+0
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xa
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x8
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xe
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x8
diff --git a/llvm/test/MC/AVR/inst-breq.s b/llvm/test/MC/AVR/inst-breq.s
index d916f6d..8b8e85a 100644
--- a/llvm/test/MC/AVR/inst-breq.s
+++ b/llvm/test/MC/AVR/inst-breq.s
@@ -18,7 +18,10 @@ bar:
; CHECK: brbs 1, bar ; encoding: [0bAAAAA001,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: b9 f3 breq .-18
-; INST-NEXT: d1 f3 breq .-12
-; INST-NEXT: b9 f3 breq .-18
-; INST-NEXT: 01 f0 breq .+0
+; INST-NEXT: f9 f3 breq .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x10
+; INST-NEXT: f9 f3 breq .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x8
+; INST-NEXT: f9 f3 breq .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0xc
+; INST-NEXT: f9 f3 breq .-2
diff --git a/llvm/test/MC/AVR/inst-brge.s b/llvm/test/MC/AVR/inst-brge.s
index 3a8fd72..ed96d89 100644
--- a/llvm/test/MC/AVR/inst-brge.s
+++ b/llvm/test/MC/AVR/inst-brge.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brge bar ; encoding: [0bAAAAA100,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: cc f4 brge .+50
-; INST-NEXT: ac f4 brge .+42
-; INST-NEXT: 04 f4 brge .+0
+; INST-NEXT: fc f7 brge .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x34
+; INST-NEXT: fc f7 brge .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x2e
+; INST-NEXT: fc f7 brge .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brhc.s b/llvm/test/MC/AVR/inst-brhc.s
index 4fc55b6..8421c91 100644
--- a/llvm/test/MC/AVR/inst-brhc.s
+++ b/llvm/test/MC/AVR/inst-brhc.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brhc bar ; encoding: [0bAAAAA101,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 35 f4 brhc .+12
-; INST-NEXT: 3d f4 brhc .+14
-; INST-NEXT: 05 f4 brhc .+0
+; INST-NEXT: fd f7 brhc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xe
+; INST-NEXT: fd f7 brhc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x12
+; INST-NEXT: fd f7 brhc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brhs.s b/llvm/test/MC/AVR/inst-brhs.s
index d0968753..a3777b4 100644
--- a/llvm/test/MC/AVR/inst-brhs.s
+++ b/llvm/test/MC/AVR/inst-brhs.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brhs bar ; encoding: [0bAAAAA101,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: fd f2 brhs .-66
-; INST-NEXT: 3d f0 brhs .+14
-; INST-NEXT: 05 f0 brhs .+0
+; INST-NEXT: fd f3 brhs .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x40
+; INST-NEXT: fd f3 brhs .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x12
+; INST-NEXT: fd f3 brhs .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brid.s b/llvm/test/MC/AVR/inst-brid.s
index 2a3a30f..888ae02 100644
--- a/llvm/test/MC/AVR/inst-brid.s
+++ b/llvm/test/MC/AVR/inst-brid.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brid bar ; encoding: [0bAAAAA111,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: af f4 brid .+42
-; INST-NEXT: ff f4 brid .+62
-; INST-NEXT: 07 f4 brid .+0
+; INST-NEXT: ff f7 brid .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x2c
+; INST-NEXT: ff f7 brid .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x42
+; INST-NEXT: ff f7 brid .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brie.s b/llvm/test/MC/AVR/inst-brie.s
index 4f867ae..1d175f1 100644
--- a/llvm/test/MC/AVR/inst-brie.s
+++ b/llvm/test/MC/AVR/inst-brie.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brie bar ; encoding: [0bAAAAA111,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 57 f0 brie .+20
-; INST-NEXT: a7 f0 brie .+40
-; INST-NEXT: 07 f0 brie .+0
+; INST-NEXT: ff f3 brie .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x16
+; INST-NEXT: ff f3 brie .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x2c
+; INST-NEXT: ff f3 brie .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brlo.s b/llvm/test/MC/AVR/inst-brlo.s
index 48499aa..4b57e77 100644
--- a/llvm/test/MC/AVR/inst-brlo.s
+++ b/llvm/test/MC/AVR/inst-brlo.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brlo bar ; encoding: [0bAAAAA000,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 30 f0 brlo .+12
-; INST-NEXT: 70 f0 brlo .+28
-; INST-NEXT: 00 f0 brlo .+0
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xe
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x20
+; INST-NEXT: f8 f3 brlo .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brlt.s b/llvm/test/MC/AVR/inst-brlt.s
index e16fd05..58e57c4d 100644
--- a/llvm/test/MC/AVR/inst-brlt.s
+++ b/llvm/test/MC/AVR/inst-brlt.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brlt bar ; encoding: [0bAAAAA100,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 44 f0 brlt .+16
-; INST-NEXT: 0c f0 brlt .+2
-; INST-NEXT: 04 f0 brlt .+0
+; INST-NEXT: fc f3 brlt .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x12
+; INST-NEXT: fc f3 brlt .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
+; INST-NEXT: fc f3 brlt .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brmi.s b/llvm/test/MC/AVR/inst-brmi.s
index 0d46af8..c406448 100644
--- a/llvm/test/MC/AVR/inst-brmi.s
+++ b/llvm/test/MC/AVR/inst-brmi.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brmi bar ; encoding: [0bAAAAA010,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 0a f1 brmi .+66
-; INST-NEXT: ea f0 brmi .+58
-; INST-NEXT: 02 f0 brmi .+0
+; INST-NEXT: fa f3 brmi .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x44
+; INST-NEXT: fa f3 brmi .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x3e
+; INST-NEXT: fa f3 brmi .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brne.s b/llvm/test/MC/AVR/inst-brne.s
index e87813a..4b00c63 100644
--- a/llvm/test/MC/AVR/inst-brne.s
+++ b/llvm/test/MC/AVR/inst-brne.s
@@ -18,7 +18,10 @@ bar:
; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 29 f4 brne .+10
-; INST-NEXT: 09 f4 brne .+2
-; INST-NEXT: 29 f4 brne .+10
-; INST-NEXT: 01 f4 brne .+0
+; INST-NEXT: f9 f7 brne .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0xc
+; INST-NEXT: f9 f7 brne .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
+; INST-NEXT: f9 f7 brne .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x10
+; INST-NEXT: f9 f7 brne .-2
diff --git a/llvm/test/MC/AVR/inst-brpl.s b/llvm/test/MC/AVR/inst-brpl.s
index 3487796..9049e24 100644
--- a/llvm/test/MC/AVR/inst-brpl.s
+++ b/llvm/test/MC/AVR/inst-brpl.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brpl bar ; encoding: [0bAAAAA010,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: d2 f7 brpl .-12
-; INST-NEXT: 4a f4 brpl .+18
-; INST-NEXT: 02 f4 brpl .+0
+; INST-NEXT: fa f7 brpl .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0xa
+; INST-NEXT: fa f7 brpl .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x16
+; INST-NEXT: fa f7 brpl .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brsh.s b/llvm/test/MC/AVR/inst-brsh.s
index be0a06c..0f32fba 100644
--- a/llvm/test/MC/AVR/inst-brsh.s
+++ b/llvm/test/MC/AVR/inst-brsh.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brsh bar ; encoding: [0bAAAAA000,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 80 f4 brsh .+32
-; INST-NEXT: 18 f5 brsh .+70
-; INST-NEXT: 00 f4 brsh .+0
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x22
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x4a
+; INST-NEXT: f8 f7 brsh .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brtc.s b/llvm/test/MC/AVR/inst-brtc.s
index 312c55c..731b495 100644
--- a/llvm/test/MC/AVR/inst-brtc.s
+++ b/llvm/test/MC/AVR/inst-brtc.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brtc bar ; encoding: [0bAAAAA110,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: d6 f4 brtc .+52
-; INST-NEXT: ce f4 brtc .+50
-; INST-NEXT: 06 f4 brtc .+0
+; INST-NEXT: fe f7 brtc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x36
+; INST-NEXT: fe f7 brtc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x36
+; INST-NEXT: fe f7 brtc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brts.s b/llvm/test/MC/AVR/inst-brts.s
index 40ef6af..bb00acb 100644
--- a/llvm/test/MC/AVR/inst-brts.s
+++ b/llvm/test/MC/AVR/inst-brts.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brts bar ; encoding: [0bAAAAA110,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 4e f0 brts .+18
-; INST-NEXT: 5e f0 brts .+22
-; INST-NEXT: 06 f0 brts .+0
+; INST-NEXT: fe f3 brts .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x14
+; INST-NEXT: fe f3 brts .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x1a
+; INST-NEXT: fe f3 brts .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brvc.s b/llvm/test/MC/AVR/inst-brvc.s
index d493ff1..f65e735 100644
--- a/llvm/test/MC/AVR/inst-brvc.s
+++ b/llvm/test/MC/AVR/inst-brvc.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brvc bar ; encoding: [0bAAAAA011,0b111101AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 93 f7 brvc .-28
-; INST-NEXT: 0b f7 brvc .-62
-; INST-NEXT: 03 f4 brvc .+0
+; INST-NEXT: fb f7 brvc .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x1a
+; INST-NEXT: fb f7 brvc .-2
+; INST-NEXT: R_AVR_7_PCREL .text-0x3a
+; INST-NEXT: fb f7 brvc .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-brvs.s b/llvm/test/MC/AVR/inst-brvs.s
index 07755d8..a5b7e4b 100644
--- a/llvm/test/MC/AVR/inst-brvs.s
+++ b/llvm/test/MC/AVR/inst-brvs.s
@@ -16,6 +16,9 @@ bar:
; CHECK: brvs bar ; encoding: [0bAAAAA011,0b111100AA]
; INST-LABEL: <foo>:
-; INST-NEXT: 4b f0 brvs .+18
-; INST-NEXT: 83 f0 brvs .+32
-; INST-NEXT: 03 f0 brvs .+0
+; INST-NEXT: fb f3 brvs .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x14
+; INST-NEXT: fb f3 brvs .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x24
+; INST-NEXT: fb f3 brvs .-2
+; INST-NEXT: R_AVR_7_PCREL .text+0x6
diff --git a/llvm/test/MC/AVR/inst-rcall.s b/llvm/test/MC/AVR/inst-rcall.s
index 1da6e7f..f7818aa 100644
--- a/llvm/test/MC/AVR/inst-rcall.s
+++ b/llvm/test/MC/AVR/inst-rcall.s
@@ -17,8 +17,11 @@ foo:
; CHECK: rcall .Ltmp3+46+2 ; encoding: [A,0b1101AAAA]
; INST-LABEL: <foo>:
-; INST-NEXT: 00 d0 rcall .+0
-; INST-NEXT: fc df rcall .-8
-; INST-NEXT: 06 d0 rcall .+12
-; INST-NEXT: 17 d0 rcall .+46
-; INST-NEXT: ea df rcall .-44
+; INST-NEXT: ff df rcall .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x2
+; INST-NEXT: ff df rcall .-2
+; INST-NEXT: R_AVR_13_PCREL .text-0x4
+; INST-NEXT: ff df rcall .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x12
+; INST-NEXT: ff df rcall .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x36
diff --git a/llvm/test/MC/AVR/inst-rjmp.s b/llvm/test/MC/AVR/inst-rjmp.s
index 6712319..6ac6343 100644
--- a/llvm/test/MC/AVR/inst-rjmp.s
+++ b/llvm/test/MC/AVR/inst-rjmp.s
@@ -33,18 +33,28 @@ x:
; CHECK: rjmp .Ltmp6+4094+2 ; encoding: [A,0b1100AAAA]
; INST-LABEL: <foo>:
-; INST-NEXT: 01 c0 rjmp .+2
; INST-NEXT: ff cf rjmp .-2
-; INST-NEXT: fd cf rjmp .-6
-; INST-NEXT: 04 c0 rjmp .+8
-; INST-NEXT: 01 c0 rjmp .+2
-; INST-NEXT: 00 c0 rjmp .+0
+; INST-NEXT: R_AVR_13_PCREL .text+0x4
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x2
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x10
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0xc
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0xc
; INST-EMPTY:
; INST-LABEL: <end>:
-; INST-NEXT: fe cf rjmp .-4
-; INST-NEXT: fd cf rjmp .-6
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0xa
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0xa
; INST-EMPTY:
; INST-LABEL: <x>:
; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x10
; INST-NEXT: 0f c0 rjmp .+30
-; INST-NEXT: ff c7 rjmp .+4094
+; INST-NEXT: ff cf rjmp .-2
+; INST-NEXT: R_AVR_13_PCREL .text+0x1014
diff --git a/llvm/test/MC/COFF/bss-text.s b/llvm/test/MC/COFF/bss-text.s
index ed68905..cedbb2f 100644
--- a/llvm/test/MC/COFF/bss-text.s
+++ b/llvm/test/MC/COFF/bss-text.s
@@ -1,13 +1,15 @@
-# RUN: not llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
## -filetype=asm does not check the error.
# RUN: llvm-mc -triple=x86_64-pc-win32 %s
+.bss
+# CHECK: <unknown>:0: error: BSS section '.bss' cannot have non-zero bytes
+ addb %bl,(%rax)
+
.section uninitialized,"b"
-# MCRelaxableFragment
-# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: IMAGE_SCN_CNT_UNINITIALIZED_DATA section 'uninitialized' cannot have instructions
+# CHECK: <unknown>:0: error: BSS section 'uninitialized' cannot have non-zero bytes
jmp foo
-.bss
-# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: IMAGE_SCN_CNT_UNINITIALIZED_DATA section '.bss' cannot have instructions
+.section bss0,"b"
addb %al,(%rax)
diff --git a/llvm/test/MC/COFF/section.s b/llvm/test/MC/COFF/section.s
index 9c1a11e..fdd6570 100644
--- a/llvm/test/MC/COFF/section.s
+++ b/llvm/test/MC/COFF/section.s
@@ -29,7 +29,7 @@
.section s ; .long 1
.section s_, "" ; .long 1
.section s_a,"a"; .long 1
-.section s_b,"b"; .long 1
+.section s_b,"b"; .long 0
.section s_d,"d"; .long 1
.section s_D,"D"; .long 1
.section s_n,"n"; .long 1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
index 721babd..08ed50d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
@@ -1146,6 +1146,18 @@
# GFX10: v_alignbit_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04]
0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04
+# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04
+
+# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04
+
+# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04
+
+# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04
+
# GFX10: v_alignbyte_b32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04]
0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04
@@ -1233,6 +1245,18 @@
# GFX10: v_alignbyte_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04]
0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04
+# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04
+
+# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04
+
+# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04
+
+# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04
+
# GFX10: v_and_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt
index 4bd9ab4..92fa802 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt
@@ -5,3 +5,15 @@
# GFX1250: s_load_b32 s4, s[2:3], 0xa nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8]
0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8
+
+# GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x0a,0x00,0x00,0xf9]
+0x01,0x01,0x00,0xf4,0x0a,0x00,0x00,0xf9
+
+# GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf9]
+0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf9
+
+# GFX1250: s_load_b32 s4, s[2:3], m0 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0xfb]
+0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0xfb
+
+# GFX1250: s_load_b32 s4, s[2:3], s5 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0x0b]
+0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0x0b
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
index fcbb58b..de7895f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
@@ -1,104 +1,272 @@
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+# GFX1250: flat_atomic_add_f32 v0, v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+0x02,0x80,0x15,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a
+
# GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_add_u32 v0, v1, v2, s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0xc0,0xff,0xff]
+0x02,0x40,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0xc0,0xff,0xff
+
# GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff]
0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff
# GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_add_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+0x02,0xc0,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_and_b32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_and_b64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+0x02,0x40,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_cmpswap_b32 v0, v2, v[2:3], s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x11,0x01,0x02,0x00,0x00,0x00]
+0x02,0x00,0x0d,0xec,0x00,0x00,0x11,0x01,0x02,0x00,0x00,0x00
+
# GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00]
0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00
+# GFX1250: flat_atomic_cmpswap_b64 v[0:1], v2, v[2:5], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_cond_sub_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_dec_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x10,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_dec_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+0x02,0x40,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_inc_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0xc0,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_inc_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_max_i32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_max_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_max_num_f32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_max_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0xc0,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_max_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_min_i32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_min_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_min_num_f32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x40,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_min_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x40,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_min_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_or_b32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x40,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_or_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_bf16 v0, v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+0x02,0x80,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a
+
# GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_pk_add_f16 v0, v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+0x02,0x40,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a
+
# GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_sub_clamp_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0xc0,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_sub_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_sub_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b32 v0, v0, v2, s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x11,0x01,0x00,0x00,0x00,0x00]
+0x02,0xc0,0x0c,0xec,0x00,0x00,0x11,0x01,0x00,0x00,0x00,0x00
+
# GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_swap_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_xor_b32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
# GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
+# GFX1250: flat_atomic_xor_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b128 v[2:5], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b64 v[2:3], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b96 v[2:4], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_i16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_u16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_prefetch_b8 v3, s[2:3] ; encoding: [0x02,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+0x02,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00
+
+# GFX1250: flat_store_b128 v2, v[2:5], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b32 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b64 v2, v[2:3], s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b96 v2, v[2:4], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
@@ -2856,6 +3024,159 @@
# GFX1250: scratch_load_b32 v5, v2, off nv ; encoding: [0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+# GFX1250: global_load_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x00,0x05,0xee,0x05,0x00,0x01,0x00,0x01,0x20,0x00,0x00]
+0x02,0x00,0x05,0xee,0x05,0x00,0x01,0x00,0x01,0x20,0x00,0x00
+
+# GFX1250: global_store_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x81,0x00,0x05,0x20,0x00,0x00]
+0x02,0x80,0x06,0xee,0x00,0x00,0x81,0x00,0x05,0x20,0x00,0x00
+
+# GFX1250: global_atomic_add_u32 v2, v5, s[2:3] scale_offset ; encoding: [0x02,0x40,0x0d,0xee,0x00,0x00,0x81,0x02,0x02,0x00,0x00,0x00]
+0x02,0x40,0x0d,0xee,0x00,0x00,0x81,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_b32 v5, v2, off scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_b32 v5, v2, off offset:32 scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00
+
+# GFX1250: scratch_load_b32 v5, v2, s1 offset:32 scale_offset ; encoding: [0x01,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+0x01,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00
+
+# GFX1250: scratch_store_b32 v2, v5, off scale_offset ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b32 v2, v5, s1 scale_offset ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+0x01,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_prefetch_b8 v[2:3] ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_prefetch_b8 v[2:3] offset:-1024 ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0xfc,0xff]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0xfc,0xff
+
+# GFX1250: flat_prefetch_b8 v[2:3] offset:-1024 th:TH_LOAD_NT scope:SCOPE_SE ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x14,0x00,0x02,0x00,0xfc,0xff]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x14,0x00,0x02,0x00,0xfc,0xff
+
+# GFX1250: flat_prefetch_b8 v[2:3] offset:-64 th:TH_LOAD_RT_NT scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x5c,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x5c,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_prefetch_b8 v[2:3] offset:1024 ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x04,0x00]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x04,0x00
+
+# GFX1250: flat_prefetch_b8 v[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_prefetch_b8 v[2:3] th:TH_LOAD_HT ; encoding: [0x7c,0x40,0x17,0xec,0x00,0x00,0x20,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x17,0xec,0x00,0x00,0x20,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_prefetch_b8 v4, s[2:3] offset:-1024 th:TH_LOAD_NT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x17,0xee,0x00,0x00,0x18,0x00,0x04,0x00,0xfc,0xff]
+0x02,0x40,0x17,0xee,0x00,0x00,0x18,0x00,0x04,0x00,0xfc,0xff
+
+# GFX1250: global_prefetch_b8 v4, s[2:3] th:TH_LOAD_RT_NT ; encoding: [0x02,0x40,0x17,0xee,0x00,0x00,0x50,0x00,0x04,0x00,0x00,0x00]
+0x02,0x40,0x17,0xee,0x00,0x00,0x50,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: global_prefetch_b8 v[2:3], off offset:-1024 th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x7c,0x40,0x17,0xee,0x00,0x00,0x24,0x00,0x02,0x00,0xfc,0xff]
+0x7c,0x40,0x17,0xee,0x00,0x00,0x24,0x00,0x02,0x00,0xfc,0xff
+
+# GFX1250: global_prefetch_b8 v[2:3], off offset:64 th:TH_LOAD_NT_RT scope:SCOPE_DEV ; encoding: [0x7c,0x40,0x17,0xee,0x00,0x00,0x48,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x40,0x17,0xee,0x00,0x00,0x48,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_prefetch_b8 v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x17,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x17,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] offset:64 ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
+
+# GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff]
+0x7c,0x80,0x1c,0xec,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff
+
+# GFX1250: flat_load_monitor_b32 v1, v[2:3] ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:64 ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x00,0x1c,0xec,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x40,0x1c,0xec,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x00,0x1c,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x04,0x00,0x1c,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b128 v[0:3], v[4:5], off ; encoding: [0x7c,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: global_load_monitor_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+0x7c,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b128 v[0:3], v[4:5], off offset:-64 th:TH_LOAD_NT ; encoding: [0x7c,0x80,0x1c,0xee,0x00,0x00,0x10,0x00,0x04,0xc0,0xff,0xff]
+0x7c,0x80,0x1c,0xee,0x00,0x00,0x10,0x00,0x04,0xc0,0xff,0xff
+
+# GFX1250: global_load_monitor_b128 v[0:3], v4, s[0:1] ; encoding: [0x00,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x00,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: global_load_monitor_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+0x00,0x80,0x1c,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b128 v[0:3], v4, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x00,0x80,0x1c,0xee,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff]
+0x00,0x80,0x1c,0xee,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff
+
+# GFX1250: global_load_monitor_b32 v1, v[2:3], off ; encoding: [0x7c,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_monitor_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b32 v1, v[2:3], off offset:-64 th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x7c,0x00,0x1c,0xee,0x01,0x00,0x68,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x00,0x1c,0xee,0x01,0x00,0x68,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_monitor_b32 v1, v2, s[0:1] ; encoding: [0x00,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x00,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_monitor_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x00,0x00,0x1c,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b32 v1, v2, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x00,0x00,0x1c,0xee,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+0x00,0x00,0x1c,0xee,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_monitor_b64 v[0:1], v[2:3], off ; encoding: [0x7c,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_monitor_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b64 v[0:1], v[2:3], off offset:-64 th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x7c,0x40,0x1c,0xee,0x00,0x00,0x24,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x40,0x1c,0xee,0x00,0x00,0x24,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_monitor_b64 v[0:1], v2, s[0:1] ; encoding: [0x00,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+0x00,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_monitor_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x00,0x40,0x1c,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b64 v[0:1], v2, s[0:1] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x00,0x40,0x1c,0xee,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+0x00,0x40,0x1c,0xee,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x00,0x1c,0xee,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x04,0x00,0x1c,0xee,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
# GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
index c1213f2..130941c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt
@@ -112,6 +112,264 @@
0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00
# GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00]
+0x02,0x09,0xfc,0x51
+# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51]
+
+0x02,0x11,0xfc,0x51
+# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51]
+
+0xc1,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x50]
+
+0xc1,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x50]
+
+0xf7,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x50]
+
+0xf7,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x50]
+
+0x80,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x50]
+
+0x80,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x50]
+
+0xf0,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x50]
+
+0xf0,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x50]
+
+0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f
+# GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x50,0x56,0x34,0x12,0xaf
+# GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x50]
+
+0x7e,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x50]
+
+0xfe,0x09,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50]
+
+0xfe,0x11,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50]
+
+0x02,0xfd,0x09,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50]
+
+0x02,0x09,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50]
+
+0x02,0x11,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50]
+
+0x6a,0x08,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x50]
+
+0x6a,0x10,0x08,0x50
+# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x50]
+
+0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00]
+
+0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00]
+
+0x02,0x09,0xfc,0x53
+# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53]
+
+0x02,0x11,0xfc,0x53
+# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53]
+
+0xc1,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x52]
+
+0xc1,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x52]
+
+0xf7,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x52]
+
+0xf7,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x52]
+
+0x80,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x52]
+
+0x80,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x52]
+
+0xf0,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x52]
+
+0xf0,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x52]
+
+0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f
+# GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x52,0x56,0x34,0x12,0xaf
+# GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x52]
+
+0x7e,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x52]
+
+0xfe,0x09,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52]
+
+0xfe,0x11,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52]
+
+0x02,0xfd,0x09,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52]
+
+0x02,0x09,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52]
+
+0x02,0x11,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52]
+
+0x6a,0x08,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x52]
+
+0x6a,0x10,0x08,0x52
+# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x52]
+
+0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00]
+
+0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00]
+
+0x02,0x09,0xfc,0x55
+# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55]
+
+0x02,0x11,0xfc,0x55
+# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55]
+
+0xc1,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x54]
+
+0xc1,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x54]
+
+0xf7,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x54]
+
+0xf7,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x54]
+
+0x80,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x54]
+
+0x80,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x54]
+
+0xf0,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x54]
+
+0xf0,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x54]
+
+0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f
+# GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
+
+0xff,0x08,0x08,0x54,0x56,0x34,0x12,0xaf
+# GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
+0x7e,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x54]
+
+0x7e,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x54]
+
+0xfe,0x09,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54]
+
+0xfe,0x11,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54]
+
+0x02,0xfd,0x09,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54]
+
+0x02,0x09,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x54]
+
+0x02,0x11,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x54]
+
+0x6a,0x08,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x54]
+
+0x6a,0x10,0x08,0x54
+# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x54]
+
+0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00]
+
+0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00
+# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00]
+
0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40
# GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
index d9d8f60..a1a1d0c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
@@ -16,6 +16,52 @@
0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00
# GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
+
+0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf
+# GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]
+
+0x05,0x00,0x6d,0xd7,0xc1,0xfe,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0x00,0x00]
+
+0x05,0x00,0x6d,0xd7,0xf0,0xfa,0x00,0x08
+# GFX1250: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0x00,0x08]
+
+0x05,0x00,0x6d,0xd7,0x7f,0xf8,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0x00,0x00]
+
+0x05,0x00,0x6d,0xd7,0x7e,0x82,0x01,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0x01,0x00]
+
+0x05,0x00,0x6d,0xd7,0x7d,0xe0,0x01,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0x01,0x00]
+
+0x05,0x00,0x6d,0xd7,0x7c,0xfc,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0x00,0x00]
+
+0x05,0x00,0x6d,0xd7,0x01,0x04,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x04,0x00,0x00]
+
+0x05,0x00,0x6d,0xd7,0x69,0xd2,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0x00,0x00]
+
+0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x00,0x10
+# GFX1250: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x00,0x10]
+
+0x05,0x00,0x6d,0xd7,0x7b,0xfa,0x01,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0x01,0x00]
+
+0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]
+
+0x05,0x00,0x6d,0xd7,0xff,0xff,0x03,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0xff,0x03,0x00]
+
+0x05,0x00,0x6d,0xd7,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf
+# GFX1250: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
+
+0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x00,0x00]
+
## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
# GFX1250-FAKE16: {{.*}}
# GFX1250-REAL16: {{.*}}
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt
new file mode 100644
index 0000000..dec73b7
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt
@@ -0,0 +1,45 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+
+0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6d,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt
new file mode 100644
index 0000000..db211f90
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt
@@ -0,0 +1,15 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+
+0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6d,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05
+# GFX1250: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt
new file mode 100644
index 0000000..18246db
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt
@@ -0,0 +1,1033 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+
+# GFX1250: v_pk_add_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x29,0xcc,0x02,0xe5,0x01,0x18]
+0x00,0x40,0x29,0xcc,0x02,0xe5,0x01,0x18
+
+# GFX1250: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x29,0xcc,0x08,0x21,0x02,0x18]
+0xfe,0x40,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x7e,0x20,0x02,0x18]
+0x04,0x40,0x29,0xcc,0x7e,0x20,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0xfe,0x21,0x02,0x18]
+0x04,0x40,0x29,0xcc,0xfe,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xfd,0x00,0x18]
+0x04,0x40,0x29,0xcc,0x08,0xfd,0x00,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0xc0,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x42,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x41,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x43,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x58]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x58
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x38]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x38
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x78]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x78
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x50,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x48,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x29,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x58,0x29,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x00]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x00
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x10]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x10
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x08]
+0x04,0x40,0x29,0xcc,0x08,0x21,0x02,0x08
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xfd,0x03,0x18]
+0x04,0x40,0x29,0xcc,0x08,0xfd,0x03,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x40,0x29,0xcc,0x08,0xd5,0x00,0x18]
+0x04,0x40,0x29,0xcc,0x08,0xd5,0x00,0x18
+
+# GFX1250: v_pk_add_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x40,0x29,0xcc,0x6a,0x20,0x02,0x18]
+0x04,0x40,0x29,0xcc,0x6a,0x20,0x02,0x18
+
+# GFX1250: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 ; encoding: [0x00,0x40,0x1f,0xcc,0x02,0x09,0xca,0x1b]
+0x00,0x40,0x1f,0xcc,0x02,0x09,0xca,0x1b
+
+# GFX1250: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0x1f,0xcc,0x04,0x11,0x42,0x1c]
+0x00,0x40,0x1f,0xcc,0x04,0x11,0x42,0x1c
+
+# GFX1250: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x1f,0xcc,0x00,0x01,0x10,0x04]
+0x08,0x60,0x1f,0xcc,0x00,0x01,0x10,0x04
+
+# GFX1250: v_pk_mul_f32 v[0:1], v[2:3], 1.0 ; encoding: [0x00,0x40,0x28,0xcc,0x02,0xe5,0x01,0x18]
+0x00,0x40,0x28,0xcc,0x02,0xe5,0x01,0x18
+
+# GFX1250: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x40,0x28,0xcc,0x08,0x21,0x02,0x18]
+0xfe,0x40,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x7e,0x20,0x02,0x18]
+0x04,0x40,0x28,0xcc,0x7e,0x20,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0xfe,0x21,0x02,0x18]
+0x04,0x40,0x28,0xcc,0xfe,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xfd,0x00,0x18]
+0x04,0x40,0x28,0xcc,0x08,0xfd,0x00,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0xc0,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0xc0,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x42,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x42,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x41,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x41,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x43,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x43,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x58]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x58
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x38]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x38
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x78]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x78
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x50,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x50,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x48,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x48,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x58,0x28,0xcc,0x08,0x21,0x02,0x18]
+0x04,0x58,0x28,0xcc,0x08,0x21,0x02,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x00]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x00
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x10]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x10
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x08]
+0x04,0x40,0x28,0xcc,0x08,0x21,0x02,0x08
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xfd,0x03,0x18]
+0x04,0x40,0x28,0xcc,0x08,0xfd,0x03,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x40,0x28,0xcc,0x08,0xd5,0x00,0x18]
+0x04,0x40,0x28,0xcc,0x08,0xd5,0x00,0x18
+
+# GFX1250: v_pk_mul_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x40,0x28,0xcc,0x6a,0x20,0x02,0x18]
+0x04,0x40,0x28,0xcc,0x6a,0x20,0x02,0x18
+
+# GFX1250: v_pk_add_min_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2d,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2d,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x2d,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x2d,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x2d,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x2d,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x2d,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2d,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2d,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x2d,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x2d,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x2d,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x2d,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x2d,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x2d,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x2d,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_min_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x2d,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x2d,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_add_max_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x14,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x14,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x14,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x14,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x14,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x14,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x14,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x14,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x14,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x14,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x14,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x14,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x14,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x14,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x14,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x14,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x14,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x14,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x14,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x14,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_max_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x14,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x14,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_add_min_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2e,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2e,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2e,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2e,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x2e,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x2e,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x2e,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x2e,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x2e,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2e,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2e,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x2e,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x2e,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x2e,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x2e,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x2e,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x2e,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x2e,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x2e,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x2e,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_min_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x2e,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x2e,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_add_max_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x15,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x15,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x15,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x15,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x15,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x15,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x15,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x15,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x15,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x15,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x15,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x15,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x15,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x15,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x15,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x15,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x15,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x15,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x15,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x15,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_add_max_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x15,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x15,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_min3_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x31,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x31,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x31,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x31,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x31,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x31,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x31,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_min3_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x31,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x31,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_min3_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x31,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x31,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x31,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x31,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x31,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x31,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x31,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x31,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x31,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x31,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x31,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_min3_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x31,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x31,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_max3_i16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2f,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2f,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_i16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2f,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_i16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2f,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_i16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x2f,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x2f,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_i16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x2f,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_max3_i16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x2f,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x2f,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_max3_i16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2f,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x2f,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x2f,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x2f,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x2f,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x2f,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x2f,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x2f,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x2f,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x2f,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x2f,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_max3_i16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x2f,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x2f,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_min3_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x32,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x32,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x32,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x32,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x32,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x32,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x32,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_min3_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x32,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x32,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_min3_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x32,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x32,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x32,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x32,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x32,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x32,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x32,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x32,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x32,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x32,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x32,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_min3_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x32,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x32,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_max3_u16 v10, 0x64, 0x64, 0x64 ; encoding: [0x0a,0x40,0x30,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x30,0xcc,0xff,0xfe,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_u16 v10, 0x64, 0x64, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x30,0xcc,0xff,0xfe,0x0d,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_u16 v10, 0x64, v2, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x30,0xcc,0xff,0x04,0x0e,0x1c,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_u16 v10, s1, 0x64, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] clamp ; encoding: [0x0a,0xc8,0x30,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00]
+0x0a,0xc8,0x30,0xcc,0x01,0xfe,0x0d,0x14,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_u16 v10, s1, v2, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0x40,0x30,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_max3_u16 v10, s1, v2, v3 clamp ; encoding: [0x0a,0xc0,0x30,0xcc,0x01,0x04,0x0e,0x1c]
+0x0a,0xc0,0x30,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX1250: v_pk_max3_u16 v10, v1, 0x64, 0x64 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x30,0xcc,0x01,0xff,0xfd,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, 0x64 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00]
+0x0a,0x40,0x30,0xcc,0x01,0x05,0xfe,0x1b,0x64,0x00,0x00,0x00
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x40,0x30,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x0a,0x50,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x50,0x30,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[0,1,1] ; encoding: [0x0a,0x70,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x70,0x30,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x0a,0x48,0x30,0xcc,0x01,0x05,0x0e,0x1c]
+0x0a,0x48,0x30,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x0a,0x48,0x30,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x48,0x30,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel_hi:[0,1,1] ; encoding: [0x0a,0x40,0x30,0xcc,0x01,0x05,0x0e,0x14]
+0x0a,0x40,0x30,0xcc,0x01,0x05,0x0e,0x14
+
+# GFX1250: v_pk_max3_u16 v10, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x0a,0x00,0x30,0xcc,0x01,0x05,0x0e,0x0c]
+0x0a,0x00,0x30,0xcc,0x01,0x05,0x0e,0x0c
+
+# GFX1250: v_pk_add_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_add_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x23,0xcc,0xc1,0xfe,0x00,0x20]
+0x05,0x59,0x23,0xcc,0xc1,0xfe,0x00,0x20
+
+# GFX1250: v_pk_add_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x23,0xcc,0xf0,0xfa,0x00,0x58]
+0x05,0x42,0x23,0xcc,0xf0,0xfa,0x00,0x58
+
+# GFX1250: v_pk_add_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x23,0xcc,0x7f,0xf8,0x00,0x18]
+0x05,0x40,0x23,0xcc,0x7f,0xf8,0x00,0x18
+
+# GFX1250: v_pk_add_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x23,0xcc,0x7e,0x82,0x01,0x18]
+0x05,0x40,0x23,0xcc,0x7e,0x82,0x01,0x18
+
+# GFX1250: v_pk_add_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x23,0xcc,0x7d,0xe0,0x01,0x18]
+0x05,0x40,0x23,0xcc,0x7d,0xe0,0x01,0x18
+
+# GFX1250: v_pk_add_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x23,0xcc,0x7c,0xfc,0x00,0x18]
+0x05,0x40,0x23,0xcc,0x7c,0xfc,0x00,0x18
+
+# GFX1250: v_pk_add_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x23,0xcc,0x01,0x04,0x00,0x18]
+0x05,0x40,0x23,0xcc,0x01,0x04,0x00,0x18
+
+# GFX1250: v_pk_add_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x23,0xcc,0x69,0xd2,0x00,0x18]
+0x05,0x40,0x23,0xcc,0x69,0xd2,0x00,0x18
+
+# GFX1250: v_pk_add_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x23,0xcc,0xfd,0xd4,0x00,0x10]
+0x05,0x48,0x23,0xcc,0xfd,0xd4,0x00,0x10
+
+# GFX1250: v_pk_add_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x23,0xcc,0x7b,0xfa,0x01,0x18]
+0x05,0x40,0x23,0xcc,0x7b,0xfa,0x01,0x18
+
+# GFX1250: v_pk_add_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x23,0xcc,0x01,0x05,0x02,0x18]
+0x05,0x40,0x23,0xcc,0x01,0x05,0x02,0x18
+
+# GFX1250: v_pk_add_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x23,0xcc,0xff,0xff,0x03,0x18]
+0x05,0x40,0x23,0xcc,0xff,0xff,0x03,0x18
+
+# GFX1250: v_pk_add_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x23,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+0x05,0x40,0x23,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_add_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x23,0xcc,0x6a,0xf6,0x00,0x18]
+0x05,0x40,0x23,0xcc,0x6a,0xf6,0x00,0x18
+
+# GFX1250: v_pk_mul_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2a,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+0xff,0xd3,0x2a,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_mul_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2a,0xcc,0xc1,0xfe,0x00,0x20]
+0x05,0x59,0x2a,0xcc,0xc1,0xfe,0x00,0x20
+
+# GFX1250: v_pk_mul_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2a,0xcc,0xf0,0xfa,0x00,0x58]
+0x05,0x42,0x2a,0xcc,0xf0,0xfa,0x00,0x58
+
+# GFX1250: v_pk_mul_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2a,0xcc,0x7f,0xf8,0x00,0x18]
+0x05,0x40,0x2a,0xcc,0x7f,0xf8,0x00,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2a,0xcc,0x7e,0x82,0x01,0x18]
+0x05,0x40,0x2a,0xcc,0x7e,0x82,0x01,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2a,0xcc,0x7d,0xe0,0x01,0x18]
+0x05,0x40,0x2a,0xcc,0x7d,0xe0,0x01,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2a,0xcc,0x7c,0xfc,0x00,0x18]
+0x05,0x40,0x2a,0xcc,0x7c,0xfc,0x00,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2a,0xcc,0x01,0x04,0x00,0x18]
+0x05,0x40,0x2a,0xcc,0x01,0x04,0x00,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2a,0xcc,0x69,0xd2,0x00,0x18]
+0x05,0x40,0x2a,0xcc,0x69,0xd2,0x00,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2a,0xcc,0xfd,0xd4,0x00,0x10]
+0x05,0x48,0x2a,0xcc,0xfd,0xd4,0x00,0x10
+
+# GFX1250: v_pk_mul_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2a,0xcc,0x7b,0xfa,0x01,0x18]
+0x05,0x40,0x2a,0xcc,0x7b,0xfa,0x01,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2a,0xcc,0x01,0x05,0x02,0x18]
+0x05,0x40,0x2a,0xcc,0x01,0x05,0x02,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2a,0xcc,0xff,0xff,0x03,0x18]
+0x05,0x40,0x2a,0xcc,0xff,0xff,0x03,0x18
+
+# GFX1250: v_pk_mul_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2a,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+0x05,0x40,0x2a,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_mul_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2a,0xcc,0x6a,0xf6,0x00,0x18]
+0x05,0x40,0x2a,0xcc,0x6a,0xf6,0x00,0x18
+
+# GFX1250: v_pk_max_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2c,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+0xff,0xd3,0x2c,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_max_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2c,0xcc,0xc1,0xfe,0x00,0x20]
+0x05,0x59,0x2c,0xcc,0xc1,0xfe,0x00,0x20
+
+# GFX1250: v_pk_max_num_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2c,0xcc,0xf0,0xfa,0x00,0x58]
+0x05,0x42,0x2c,0xcc,0xf0,0xfa,0x00,0x58
+
+# GFX1250: v_pk_max_num_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2c,0xcc,0x7f,0xf8,0x00,0x18]
+0x05,0x40,0x2c,0xcc,0x7f,0xf8,0x00,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2c,0xcc,0x7e,0x82,0x01,0x18]
+0x05,0x40,0x2c,0xcc,0x7e,0x82,0x01,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2c,0xcc,0x7d,0xe0,0x01,0x18]
+0x05,0x40,0x2c,0xcc,0x7d,0xe0,0x01,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2c,0xcc,0x7c,0xfc,0x00,0x18]
+0x05,0x40,0x2c,0xcc,0x7c,0xfc,0x00,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2c,0xcc,0x01,0x04,0x00,0x18]
+0x05,0x40,0x2c,0xcc,0x01,0x04,0x00,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2c,0xcc,0x69,0xd2,0x00,0x18]
+0x05,0x40,0x2c,0xcc,0x69,0xd2,0x00,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2c,0xcc,0xfd,0xd4,0x00,0x10]
+0x05,0x48,0x2c,0xcc,0xfd,0xd4,0x00,0x10
+
+# GFX1250: v_pk_max_num_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2c,0xcc,0x7b,0xfa,0x01,0x18]
+0x05,0x40,0x2c,0xcc,0x7b,0xfa,0x01,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2c,0xcc,0x01,0x05,0x02,0x18]
+0x05,0x40,0x2c,0xcc,0x01,0x05,0x02,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2c,0xcc,0xff,0xff,0x03,0x18]
+0x05,0x40,0x2c,0xcc,0xff,0xff,0x03,0x18
+
+# GFX1250: v_pk_max_num_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2c,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+0x05,0x40,0x2c,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_max_num_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2c,0xcc,0x6a,0xf6,0x00,0x18]
+0x05,0x40,0x2c,0xcc,0x6a,0xf6,0x00,0x18
+
+# GFX1250: v_pk_min_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2b,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+0xff,0xd3,0x2b,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_min_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2b,0xcc,0xc1,0xfe,0x00,0x20]
+0x05,0x59,0x2b,0xcc,0xc1,0xfe,0x00,0x20
+
+# GFX1250: v_pk_min_num_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2b,0xcc,0xf0,0xfa,0x00,0x58]
+0x05,0x42,0x2b,0xcc,0xf0,0xfa,0x00,0x58
+
+# GFX1250: v_pk_min_num_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2b,0xcc,0x7f,0xf8,0x00,0x18]
+0x05,0x40,0x2b,0xcc,0x7f,0xf8,0x00,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2b,0xcc,0x7e,0x82,0x01,0x18]
+0x05,0x40,0x2b,0xcc,0x7e,0x82,0x01,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2b,0xcc,0x7d,0xe0,0x01,0x18]
+0x05,0x40,0x2b,0xcc,0x7d,0xe0,0x01,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2b,0xcc,0x7c,0xfc,0x00,0x18]
+0x05,0x40,0x2b,0xcc,0x7c,0xfc,0x00,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2b,0xcc,0x01,0x04,0x00,0x18]
+0x05,0x40,0x2b,0xcc,0x01,0x04,0x00,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2b,0xcc,0x69,0xd2,0x00,0x18]
+0x05,0x40,0x2b,0xcc,0x69,0xd2,0x00,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2b,0xcc,0xfd,0xd4,0x00,0x10]
+0x05,0x48,0x2b,0xcc,0xfd,0xd4,0x00,0x10
+
+# GFX1250: v_pk_min_num_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2b,0xcc,0x7b,0xfa,0x01,0x18]
+0x05,0x40,0x2b,0xcc,0x7b,0xfa,0x01,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2b,0xcc,0x01,0x05,0x02,0x18]
+0x05,0x40,0x2b,0xcc,0x01,0x05,0x02,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2b,0xcc,0xff,0xff,0x03,0x18]
+0x05,0x40,0x2b,0xcc,0xff,0xff,0x03,0x18
+
+# GFX1250: v_pk_min_num_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2b,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00]
+0x05,0x40,0x2b,0xcc,0x6b,0xfe,0x01,0x18,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_min_num_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2b,0xcc,0x6a,0xf6,0x00,0x18]
+0x05,0x40,0x2b,0xcc,0x6a,0xf6,0x00,0x18
+
+# GFX1250: v_pk_fma_bf16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp ; encoding: [0xff,0xa7,0x11,0xcc,0xff,0xd6,0xf0,0xf9,0x0b,0xfe,0x00,0x00]
+0xff,0xa7,0x11,0xcc,0xff,0xd6,0xf0,0xf9,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_fma_bf16 v5, -1, exec_hi, src_scc neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x05,0x42,0x11,0xcc,0xc1,0xfe,0xf4,0x5b]
+0x05,0x42,0x11,0xcc,0xc1,0xfe,0xf4,0x5b
+
+# GFX1250: v_pk_fma_bf16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x05,0x4c,0x11,0xcc,0xf0,0xfa,0xc0,0x93]
+0x05,0x4c,0x11,0xcc,0xf0,0xfa,0xc0,0x93
+
+# GFX1250: v_pk_fma_bf16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x11,0xcc,0x7f,0xf8,0xa8,0x11]
+0x05,0x00,0x11,0xcc,0x7f,0xf8,0xa8,0x11
+
+# GFX1250: v_pk_fma_bf16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x11,0xcc,0x7e,0x82,0xad,0x01]
+0x05,0x40,0x11,0xcc,0x7e,0x82,0xad,0x01
+
+# GFX1250: v_pk_fma_bf16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x11,0xcc,0x7d,0xe0,0xf5,0x01]
+0x05,0x00,0x11,0xcc,0x7d,0xe0,0xf5,0x01
+
+# GFX1250: v_pk_fma_bf16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x05,0x39,0x11,0xcc,0x7c,0xfc,0xfc,0x2b,0x0b,0xfe,0x00,0x00]
+0x05,0x39,0x11,0xcc,0x7c,0xfc,0xfc,0x2b,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_fma_bf16 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x11,0xcc,0x01,0xfe,0xff,0x19]
+0x05,0x40,0x11,0xcc,0x01,0xfe,0xff,0x19
+
+# GFX1250: v_pk_fma_bf16 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x11,0xcc,0x69,0xd2,0xf8,0x19]
+0x05,0x40,0x11,0xcc,0x69,0xd2,0xf8,0x19
+
+# GFX1250: v_pk_fma_bf16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x11,0xcc,0xfd,0xd4,0x04,0x0b]
+0x05,0x50,0x11,0xcc,0xfd,0xd4,0x04,0x0b
+
+# GFX1250: v_pk_fma_bf16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x11,0xcc,0x7b,0xfa,0xed,0x19]
+0x05,0x40,0x11,0xcc,0x7b,0xfa,0xed,0x19
+
+# GFX1250: v_pk_fma_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x11,0xcc,0x01,0x05,0x0e,0x18]
+0x05,0x40,0x11,0xcc,0x01,0x05,0x0e,0x18
+
+# GFX1250: v_pk_fma_bf16 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x11,0xcc,0xff,0x05,0xa4,0x19]
+0x05,0x40,0x11,0xcc,0xff,0x05,0xa4,0x19
+
+# GFX1250: v_pk_fma_bf16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x40,0x11,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00]
+0x05,0x40,0x11,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00
+
+# GFX1250: v_pk_fma_bf16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x11,0xcc,0x6a,0xf6,0x0c,0x1c]
+0x05,0x40,0x11,0xcc,0x6a,0xf6,0x0c,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x36,0xcc,0x02,0x0b,0xca,0x1b]
+0x01,0x40,0x36,0xcc,0x02,0x0b,0xca,0x1b
+
+# GFX1250: v_pk_minimum3_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x36,0xcc,0x04,0x13,0x42,0x1c]
+0x01,0x40,0x36,0xcc,0x04,0x13,0x42,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x36,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0xc0,0x36,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x36,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x44,0x36,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x36,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x42,0x36,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x36,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x41,0x36,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x36,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x47,0x36,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x9c]
+0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x9c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x5c]
+0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x5c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x3c]
+0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0x3c
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x40,0x36,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x36,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x47,0x36,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x36,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x60,0x36,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_minimum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x36,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x00,0x36,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_maximum3_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x37,0xcc,0x02,0x0b,0xca,0x1b]
+0x01,0x40,0x37,0xcc,0x02,0x0b,0xca,0x1b
+
+# GFX1250: v_pk_maximum3_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x37,0xcc,0x04,0x13,0x42,0x1c]
+0x01,0x40,0x37,0xcc,0x04,0x13,0x42,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x37,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0xc0,0x37,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x37,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x44,0x37,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x37,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x42,0x37,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x37,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x41,0x37,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x37,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x47,0x37,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x9c]
+0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x9c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x5c]
+0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x5c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x3c]
+0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0x3c
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x40,0x37,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x37,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x47,0x37,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x37,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x60,0x37,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x37,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x00,0x37,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_min3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x38,0xcc,0x02,0x0b,0xca,0x1b]
+0x01,0x40,0x38,0xcc,0x02,0x0b,0xca,0x1b
+
+# GFX1250: v_pk_min3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x38,0xcc,0x04,0x13,0x42,0x1c]
+0x01,0x40,0x38,0xcc,0x04,0x13,0x42,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x38,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0xc0,0x38,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x38,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x44,0x38,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x38,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x42,0x38,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x38,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x41,0x38,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x9c]
+0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x9c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x5c]
+0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x5c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x3c]
+0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x3c
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x38,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x60,0x38,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x38,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x00,0x38,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_max3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b]
+0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b
+
+# GFX1250: v_pk_max3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x39,0xcc,0x04,0x13,0x42,0x1c]
+0x01,0x40,0x39,0xcc,0x04,0x13,0x42,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x39,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0xc0,0x39,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x39,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x44,0x39,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x39,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x42,0x39,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x39,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x41,0x39,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0x1c]
+0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0x1c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x9c]
+0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x9c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x5c]
+0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x5c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x3c]
+0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x3c
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0xfc]
+0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0xfc
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x39,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x60,0x39,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04]
+0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61]
+0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53]
+0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21]
+0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9]
+0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3]
+0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3
+
+# GFX1250: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9]
+0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9
+
+# GFX1250: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03]
+0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03
+
+# GFX1250: v_fma_mix_f32_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04]
+0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04
+
+# GFX1250: v_fma_mix_f32_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01]
+0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01
+
+# GFX1250: v_fma_mix_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00]
+0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00
+
+# GFX1250: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01]
+0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01
+
+# GFX1250: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07]
+0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07
+
+# GFX1250: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01]
+0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01
+
+# GFX1250: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81]
+0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81
+
+# GFX1250: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61]
+0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61
+
+# GFX1250: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53]
+0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53
+
+# GFX1250: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21]
+0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21
+
+# GFX1250: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9]
+0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9
+
+# GFX1250: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3]
+0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3
+
+# GFX1250: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9]
+0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9
+
+# GFX1250: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03]
+0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03
+
+# GFX1250: v_fma_mixlo_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04]
+0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04
+
+# GFX1250: v_fma_mixlo_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01]
+0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01
+
+# GFX1250: v_fma_mixlo_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00]
+0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00
+
+# GFX1250: v_fma_mixlo_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01]
+0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01
+
+# GFX1250: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07]
+0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07
+
+# GFX1250: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01]
+0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01
+
+# GFX1250: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81]
+0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81
+
+# GFX1250: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61]
+0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61
+
+# GFX1250: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53]
+0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53
+
+# GFX1250: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21]
+0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21
+
+# GFX1250: v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9]
+0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9
+
+# GFX1250: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3]
+0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3
+
+# GFX1250: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9]
+0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9
+
+# GFX1250: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03]
+0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03
+
+# GFX1250: v_fma_mixhi_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04]
+0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04
+
+# GFX1250: v_fma_mixhi_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01]
+0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01
+
+# GFX1250: v_fma_mixhi_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00]
+0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00
+
+# GFX1250: v_fma_mixhi_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01]
+0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01
+
+# GFX1250: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07]
+0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07
+
+# GFX1250: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01]
+0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01
+
+# GFX1250: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81]
+0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
index d76ec4c..e20f020 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
@@ -364,6 +364,45 @@
0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c
# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
+0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
+
+0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
+
+0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
+
+0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
+
+0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
+
+0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
+
+0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84
+# GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
+
0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b
# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
index 618e081..802d6368 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
@@ -11310,6 +11310,18 @@
# CHECK: v_alignbit_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01]
0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01
+# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04
+
+# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04
+
+# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04
+
+# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04
+
# CHECK: v_alignbyte_b32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04]
0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04
@@ -11406,6 +11418,18 @@
# CHECK: v_alignbyte_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01]
0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01
+# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04
+
+# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04
+
+# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04
+
+# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04
+
# CHECK: v_min3_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]
0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04
diff --git a/llvm/test/MC/ELF/AArch64/cfi.s b/llvm/test/MC/ELF/AArch64/cfi.s
index 6bdf03c..7047f92 100644
--- a/llvm/test/MC/ELF/AArch64/cfi.s
+++ b/llvm/test/MC/ELF/AArch64/cfi.s
@@ -557,12 +557,14 @@ f37:
// CHECK-NEXT: }
.ifdef ERR
-// ERR: [[#@LINE+1]]:15: error: expected .eh_frame or .debug_frame
+// ERR: [[#@LINE+1]]:15: error: expected .eh_frame, .debug_frame, or .sframe
.cfi_sections $
// ERR: [[#@LINE+1]]:28: error: expected comma
.cfi_sections .debug_frame $
// ERR: [[#@LINE+1]]:39: error: expected comma
.cfi_sections .debug_frame, .eh_frame $
+// ERR: [[#@LINE+1]]:48: error: expected comma
+.cfi_sections .debug_frame, .eh_frame, .sframe $
// ERR: [[#@LINE+1]]:16: error: unexpected token
.cfi_startproc $
diff --git a/llvm/test/MC/ELF/cfi.s b/llvm/test/MC/ELF/cfi.s
index 3bd16ae..b7f9371 100644
--- a/llvm/test/MC/ELF/cfi.s
+++ b/llvm/test/MC/ELF/cfi.s
@@ -445,12 +445,14 @@ f37:
// CHECK: }
.ifdef ERR
-// ERR: [[#@LINE+1]]:15: error: expected .eh_frame or .debug_frame
+// ERR: [[#@LINE+1]]:15: error: expected .eh_frame, .debug_frame, or .sframe
.cfi_sections $
// ERR: [[#@LINE+1]]:28: error: expected comma
.cfi_sections .debug_frame $
// ERR: [[#@LINE+1]]:39: error: expected comma
.cfi_sections .debug_frame, .eh_frame $
+// ERR: [[#@LINE+1]]:48: error: expected comma
+.cfi_sections .debug_frame, .eh_frame, .sframe $
// ERR: [[#@LINE+1]]:16: error: unexpected token
.cfi_startproc $
diff --git a/llvm/test/MC/ELF/mc-dump.s b/llvm/test/MC/ELF/mc-dump.s
index 5cc2e9f..fd6cf95 100644
--- a/llvm/test/MC/ELF/mc-dump.s
+++ b/llvm/test/MC/ELF/mc-dump.s
@@ -6,9 +6,9 @@
#CHECK-LABEL:assembler backend - final-layout
# CHECK:Sections:[
# CHECK-NEXT:MCSection Name:.text
-# CHECK-NEXT:0 Data Size:0 []
+# CHECK-NEXT:0 Align Size:0+0 []
+# CHECK-NEXT: Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops
# CHECK-NEXT: Symbol @0 .text
-# CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops
# CHECK-NEXT:0 Data Size:0 []
# CHECK-NEXT: Symbol @0 _start
# CHECK-NEXT: Symbol @0 Temporary
@@ -22,9 +22,9 @@
# CHECK-NEXT: Symbol @0 Temporary
# CHECK-NEXT: Symbol @16 Temporary
# CHECK-NEXT:MCSection Name:.data
-# CHECK-NEXT:0 Data Size:0 []
+# CHECK-NEXT:0 Align Size:0+0 []
+# CHECK-NEXT: Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4
# CHECK-NEXT: Symbol @0 .data
-# CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4
# CHECK-NEXT:0 Data Size:4 [01,00,00,00]
# CHECK-NEXT:4 Fill Value:0 ValueSize:1 NumValues:1
# CHECK-NEXT:5 LEB Size:0+1 [15] Value:.Ltmp0-_start Signed:0
diff --git a/llvm/test/MC/ELF/nobits-non-zero-value.s b/llvm/test/MC/ELF/nobits-non-zero-value.s
index ff43e69..ea95ec97 100644
--- a/llvm/test/MC/ELF/nobits-non-zero-value.s
+++ b/llvm/test/MC/ELF/nobits-non-zero-value.s
@@ -1,26 +1,45 @@
-# RUN: not llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
+# RUN: not llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: --implicit-check-not=warning:
## -filetype=asm does not check the error.
# RUN: llvm-mc -triple=x86_64 %s
.section .tbss,"aw",@nobits
-# MCRelaxableFragment
-# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: SHT_NOBITS section '.tbss' cannot have instructions
jmp foo
.bss
-# CHECK: {{.*}}.s:[[#@LINE+1]]:3: error: SHT_NOBITS section '.bss' cannot have instructions
addb %al,(%rax)
-# CHECK: {{.*}}.s:[[#@LINE+1]]:11: warning: ignoring non-zero fill value in SHT_NOBITS section '.bss'
+# CHECK: {{.*}}.s:[[#@LINE+1]]:11: warning: ignoring non-zero fill value in BSS section '.bss'
.align 4, 42
-# CHECK-NOT: {{.*}}.s:[[#@LINE+1]]:11: warning: ignoring non-zero fill value in SHT_NOBITS section '.bss'
-.align 4, 0
-
-# CHECK: <unknown>:0: error: SHT_NOBITS section '.bss' cannot have non-zero initializers
.long 1
-.section .bss1,"aw",%nobits
-# CHECK: <unknown>:0: error: SHT_NOBITS section '.bss1' cannot have fixups
+.section .bss0,"aw",%nobits
+addb %al,(%rax)
+
+.section data_fixup,"aw",%nobits
.quad foo
+
+.section fill,"aw",%nobits
+.fill b-a,1,1
+
+.section org,"aw",%nobits
+.org 1,1
+
+.section ok,"aw",%nobits
+.org 1
+.fill 1
+.fill b-a,1,0
+.align 4, 0
+.long 0
+
+.text
+a: nop
+b:
+
+## Location is not tracked, for efficiency.
+# CHECK: <unknown>:0: error: BSS section '.tbss' cannot have non-zero bytes
+# CHECK: <unknown>:0: error: BSS section '.bss' cannot have non-zero bytes
+# CHECK: <unknown>:0: error: BSS section 'data_fixup' cannot have fixups
+# CHECK: <unknown>:0: error: BSS section 'fill' cannot have non-zero bytes
+# CHECK: <unknown>:0: error: BSS section 'org' cannot have non-zero bytes
diff --git a/llvm/test/MC/ELF/section-sym-err.s b/llvm/test/MC/ELF/section-sym-err.s
index afed21d..2f7ab69 100644
--- a/llvm/test/MC/ELF/section-sym-err.s
+++ b/llvm/test/MC/ELF/section-sym-err.s
@@ -1,6 +1,9 @@
-// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
+# RUN: not llvm-mc -filetype=obj -triple x86_64 %s -o %t 2>&1 | FileCheck %s
.section foo
foo:
+# CHECK: [[#@LINE-1]]:1: error: symbol 'foo' is already defined
-// CHECK: error: symbol 'foo' is already defined
+x1:
+.section x1
+# CHECK: <unknown>:0: error: invalid symbol redefinition
diff --git a/llvm/test/MC/ELF/section-sym-err2.s b/llvm/test/MC/ELF/section-sym-err2.s
deleted file mode 100644
index 27d8e9a..0000000
--- a/llvm/test/MC/ELF/section-sym-err2.s
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
-
-foo:
-.section foo
-
-// CHECK: error: invalid symbol redefinition
diff --git a/llvm/test/MC/ELF/section-sym2.s b/llvm/test/MC/ELF/section-sym2.s
index b404ef7..167fc8c 100644
--- a/llvm/test/MC/ELF/section-sym2.s
+++ b/llvm/test/MC/ELF/section-sym2.s
@@ -1,24 +1,27 @@
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj --symbols -r --expand-relocs - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t
+# RUN: llvm-readelf -Srs %t | FileCheck %s
-// Test that we can forward reference a section.
+## Test that we can forward-reference a section.
mov .rodata, %rsi
-.section .rodata
+mov .debug_info, %rsi
-// CHECK:Relocations [
-// CHECK: Section {{.*}} .rela.text {
-// CHECK: Relocation {
-// CHECK: Offset: 0x4
-// CHECK: Type: R_X86_64_32S (11)
-// CHECK: Symbol: .rodata
-// CHECK: Addend: 0x0
-// CHECK: }
-// CHECK: }
-// CHECK:]
+.section .rodata,"a"
+.section .debug_info,"G",@progbits,11,comdat; .long x1
+.section .debug_info,"G",@progbits,22,comdat; .long x2
+.section .debug_info,"",@progbits; .long x0
-// There is only one .rodata symbol
+# CHECK: Relocation section '.rela.debug_info' at offset {{.*}} contains 1
+# CHECK: Relocation section '.rela.debug_info' at offset {{.*}} contains 1
+# CHECK: Relocation section '.rela.debug_info' at offset {{.*}} contains 1
-// CHECK:Symbols [
-// CHECK: Type: Section (0x3)
-// CHECK: Section: .rodata
-// CHECK-NOT: Section: .rodata
+# CHECK: Symbol table '.symtab' contains 8 entries:
+# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name
+# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
+# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT 4 .rodata
+# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT 11 .debug_info
+# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT 5 11
+# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT 8 22
+# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND x1
+# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND x2
+# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND x0
diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s
index f722584..e8f4b14 100644
--- a/llvm/test/MC/RISCV/Relocations/mc-dump.s
+++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s
@@ -3,16 +3,18 @@
# CHECK:Sections:[
# CHECK-NEXT:MCSection Name:.text
-# CHECK-NEXT:0 Data Size:0 []
+# CHECK-NEXT:0 Align Size:0+0 []
+# CHECK-NEXT: Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops
# CHECK-NEXT: Symbol @0 .text
-# CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops
# CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00]
# CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4023
# CHECK-NEXT: Symbol @0 $x
-# CHECK-NEXT:8 Data Size:0 []
-# CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
-# CHECK-NEXT:12 Data Size:4 [13,05,30,00]
-# CHECK-NEXT:16 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
+# CHECK-NEXT:8 Align Size:0+4 []
+# CHECK-NEXT: Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
+# CHECK-NEXT: Fixup @0 Value:4 Kind:[[#]]
+# CHECK-NEXT:12 Align Size:4+4 [13,05,30,00]
+# CHECK-NEXT: Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
+# CHECK-NEXT: Fixup @4 Value:4 Kind:[[#]]
# CHECK-NEXT:]
call ext
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index b7cd712..19cc4d5 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -448,7 +448,7 @@
# CHECK: .attribute 5, "rv32i2p1_zilsd1p0"
.attribute arch, "rv64i_xsfvfwmaccqqq"
-# CHECK: attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0"
+# CHECK: attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfvfwmaccqqq1p0"
.attribute arch, "rv32i_ssnpm1p0"
# CHECK: attribute 5, "rv32i2p1_ssnpm1p0"
diff --git a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s
index c259c14..ffff0f2 100644
--- a/llvm/test/MC/RISCV/rv32p-valid.s
+++ b/llvm/test/MC/RISCV/rv32p-valid.s
@@ -71,8 +71,8 @@ psabs.h a1, a2
# CHECK-ASM: encoding: [0x9b,0x22,0x73,0xe4]
psabs.b t0, t1
# CHECK-ASM-AND-OBJ: plui.h gp, 32
-# CHECK-ASM: encoding: [0x9b,0x21,0x20,0xf0]
+# CHECK-ASM: encoding: [0x9b,0x21,0x08,0xf0]
plui.h gp, 32
# CHECK-ASM-AND-OBJ: plui.h gp, -412
-# CHECK-ASM: encoding: [0x9b,0xa1,0x64,0xf0]
+# CHECK-ASM: encoding: [0x9b,0x21,0x99,0xf0]
plui.h gp, 612
diff --git a/llvm/test/MC/RISCV/rv64p-valid.s b/llvm/test/MC/RISCV/rv64p-valid.s
index 3ea6b00..a0d6ead 100644
--- a/llvm/test/MC/RISCV/rv64p-valid.s
+++ b/llvm/test/MC/RISCV/rv64p-valid.s
@@ -95,13 +95,13 @@ psabs.h t1, t5
# CHECK-ASM: encoding: [0x1b,0x25,0x79,0xe4]
psabs.b a0, s2
# CHECK-ASM-AND-OBJ: plui.h s2, 4
-# CHECK-ASM: encoding: [0x1b,0x29,0x04,0xf0]
+# CHECK-ASM: encoding: [0x1b,0x29,0x01,0xf0]
plui.h s2, 4
# CHECK-ASM-AND-OBJ: plui.h gp, -412
-# CHECK-ASM: encoding: [0x9b,0xa1,0x64,0xf0]
+# CHECK-ASM: encoding: [0x9b,0x21,0x99,0xf0]
plui.h gp, 612
# CHECK-ASM-AND-OBJ: plui.w a2, 1
-# CHECK-ASM: encoding: [0x1b,0x26,0x01,0xf2]
+# CHECK-ASM: encoding: [0x1b,0x26,0x00,0xf3]
plui.w a2, 1
# CHECK-ASM-AND-OBJ: plui.w a2, -1
# CHECK-ASM: encoding: [0x1b,0xa6,0xff,0xf3]
diff --git a/llvm/test/MC/X86/intel-syntax-parentheses.s b/llvm/test/MC/X86/intel-syntax-parentheses.s
new file mode 100644
index 0000000..ae53f64
--- /dev/null
+++ b/llvm/test/MC/X86/intel-syntax-parentheses.s
@@ -0,0 +1,10 @@
+// RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2>&1 | FileCheck %s
+
+.intel_syntax
+
+// CHECK: error: invalid base+index expression
+ lea rdi, [(label + rsi) + rip]
+// CHECK: leaq 1(%rax,%rdi), %rdi
+ lea rdi, [(rax + rdi) + 1]
+label:
+ .quad 42
diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll
index db398d6..6fa57f1 100644
--- a/llvm/test/Other/new-pm-print-pipeline.ll
+++ b/llvm/test/Other/new-pm-print-pipeline.ll
@@ -32,7 +32,7 @@
; CHECK-10: function(loop-unroll<O2>,loop-unroll<partial;peeling;runtime;upperbound;profile-peeling;full-unroll-max=5;O1>,loop-unroll<no-partial;no-peeling;no-runtime;no-upperbound;no-profile-peeling;full-unroll-max=7;O1>)
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep;no-memoryssa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11
-; CHECK-11: function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;no-memdep;no-memoryssa>)
+; CHECK-11: function(gvn<>,gvn<pre;load-pre;split-backedge-load-pre;no-memdep;memoryssa>,gvn<no-pre;no-load-pre;no-split-backedge-load-pre;memdep;no-memoryssa>)
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(early-cse<>,early-cse<memssa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-12
; CHECK-12: function(early-cse<>,early-cse<memssa>)
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/basic.td b/llvm/test/TableGen/SDNodeInfoEmitter/advanced.td
index 2b4c76a..d7eeaba 100644
--- a/llvm/test/TableGen/SDNodeInfoEmitter/basic.td
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/advanced.td
@@ -1,99 +1,4 @@
-// RUN: split-file %s %t
-
-//--- no-nodes.td
-// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %t/no-nodes.td \
-// RUN: | FileCheck %t/no-nodes.td
-
-include "llvm/Target/Target.td"
-
-def MyTarget : Target;
-
-// CHECK: #ifdef GET_SDNODE_ENUM
-// CHECK-NEXT: #undef GET_SDNODE_ENUM
-// CHECK-EMPTY:
-// CHECK-NEXT: namespace llvm::MyTargetISD {
-// CHECK-EMPTY:
-// CHECK-NEXT: static constexpr unsigned GENERATED_OPCODE_END = ISD::BUILTIN_OP_END;
-// CHECK-EMPTY:
-// CHECK-NEXT: } // namespace llvm::MyTargetISD
-// CHECK-EMPTY:
-// CHECK-NEXT: #endif // GET_SDNODE_ENUM
-// CHECK-EMPTY:
-// CHECK-NEXT: #ifdef GET_SDNODE_DESC
-// CHECK-NEXT: #undef GET_SDNODE_DESC
-// CHECK-EMPTY:
-// CHECK-NEXT: namespace llvm {
-// CHECK-EMPTY:
-// CHECK-NEXT: #ifdef __GNUC__
-// CHECK-NEXT: #pragma GCC diagnostic push
-// CHECK-NEXT: #pragma GCC diagnostic ignored "-Woverlength-strings"
-// CHECK-NEXT: #endif
-// CHECK-NEXT: static constexpr char MyTargetSDNodeNamesStorage[] =
-// CHECK-NEXT: "\0"
-// CHECK-NEXT: ;
-// CHECK-NEXT: #ifdef __GNUC__
-// CHECK-NEXT: #pragma GCC diagnostic pop
-// CHECK-NEXT: #endif
-// CHECK-EMPTY:
-// CHECK-NEXT: static constexpr llvm::StringTable
-// CHECK-NEXT: MyTargetSDNodeNames = MyTargetSDNodeNamesStorage;
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDTypeConstraint MyTargetSDTypeConstraints[] = {
-// CHECK-NEXT: /* dummy */ {SDTCisVT, 0, 0, MVT::INVALID_SIMPLE_VALUE_TYPE}
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDNodeDesc MyTargetSDNodeDescs[] = {
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDNodeInfo MyTargetGenSDNodeInfo(
-// CHECK-NEXT: /*NumOpcodes=*/0, MyTargetSDNodeDescs,
-// CHECK-NEXT: MyTargetSDNodeNames, MyTargetSDTypeConstraints);
-// CHECK-EMPTY:
-// CHECK-NEXT: } // namespace llvm
-// CHECK-EMPTY:
-// CHECK-NEXT: #endif // GET_SDNODE_DESC
-
-
-//--- trivial-node.td
-// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %t/trivial-node.td \
-// RUN: | FileCheck %t/trivial-node.td
-
-include "llvm/Target/Target.td"
-
-def MyTarget : Target;
-
-def my_noop : SDNode<"MyTargetISD::NOOP", SDTypeProfile<0, 0, []>>;
-
-// CHECK: namespace llvm::MyTargetISD {
-// CHECK-EMPTY:
-// CHECK-NEXT: enum GenNodeType : unsigned {
-// CHECK-NEXT: NOOP = ISD::BUILTIN_OP_END,
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static constexpr unsigned GENERATED_OPCODE_END = NOOP + 1;
-// CHECK-EMPTY:
-// CHECK-NEXT: } // namespace llvm::MyTargetISD
-
-// CHECK: static constexpr char MyTargetSDNodeNamesStorage[] =
-// CHECK-NEXT: "\0"
-// CHECK-NEXT: "MyTargetISD::NOOP\0"
-// CHECK-NEXT: ;
-
-// CHECK: static const SDTypeConstraint MyTargetSDTypeConstraints[] = {
-// CHECK-NEXT: /* dummy */ {SDTCisVT, 0, 0, MVT::INVALID_SIMPLE_VALUE_TYPE}
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDNodeDesc MyTargetSDNodeDescs[] = {
-// CHECK-NEXT: {0, 0, 0, 0, 0, 1, 0, 0}, // NOOP
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDNodeInfo MyTargetGenSDNodeInfo(
-// CHECK-NEXT: /*NumOpcodes=*/1, MyTargetSDNodeDescs,
-// CHECK-NEXT: MyTargetSDNodeNames, MyTargetSDTypeConstraints);
-
-//--- advanced.td
-// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %t/advanced.td \
-// RUN: | FileCheck %t/advanced.td
+// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %s | FileCheck %s
include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-1.td b/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-1.td
new file mode 100644
index 0000000..8b86f93
--- /dev/null
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-1.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def MyTarget : Target;
+
+def my_node_a : SDNode<"MyTargetISD::NODE", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>>;
+def my_node_b : SDNode<"MyTargetISD::NODE", SDTypeProfile<1, 0, [SDTCisVT<0, f32>]>>;
+
+// CHECK: enum GenNodeType : unsigned {
+// CHECK-NEXT: NODE = ISD::BUILTIN_OP_END,
+// CHECK-NEXT: };
+
+// CHECK: static constexpr char MyTargetSDNodeNamesStorage[] =
+// CHECK-NEXT: "\0"
+// CHECK-NEXT: "MyTargetISD::NODE\0"
+// CHECK-NEXT: ;
+
+// CHECK: static const SDTypeConstraint MyTargetSDTypeConstraints[] = {
+// CHECK-NEXT: /* dummy */ {SDTCisVT, 0, 0, MVT::INVALID_SIMPLE_VALUE_TYPE}
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDNodeDesc MyTargetSDNodeDescs[] = {
+// CHECK-NEXT: {1, 0, 0, 0, 0, 1, 0, 0}, // NODE
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDNodeInfo MyTargetGenSDNodeInfo(
+// CHECK-NEXT: /*NumOpcodes=*/1, MyTargetSDNodeDescs,
+// CHECK-NEXT: MyTargetSDNodeNames, MyTargetSDTypeConstraints);
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints.td b/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-2.td
index c09e219..29429e9 100644
--- a/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints.td
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/ambiguous-constraints-2.td
@@ -1,39 +1,4 @@
-// RUN: split-file %s %t
-
-//--- test1.td
-// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %t/test1.td | FileCheck %t/test1.td
-
-include "llvm/Target/Target.td"
-
-def MyTarget : Target;
-
-def my_node_a : SDNode<"MyTargetISD::NODE", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>>;
-def my_node_b : SDNode<"MyTargetISD::NODE", SDTypeProfile<1, 0, [SDTCisVT<0, f32>]>>;
-
-// CHECK: enum GenNodeType : unsigned {
-// CHECK-NEXT: NODE = ISD::BUILTIN_OP_END,
-// CHECK-NEXT: };
-
-// CHECK: static constexpr char MyTargetSDNodeNamesStorage[] =
-// CHECK-NEXT: "\0"
-// CHECK-NEXT: "MyTargetISD::NODE\0"
-// CHECK-NEXT: ;
-
-// CHECK: static const SDTypeConstraint MyTargetSDTypeConstraints[] = {
-// CHECK-NEXT: /* dummy */ {SDTCisVT, 0, 0, MVT::INVALID_SIMPLE_VALUE_TYPE}
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDNodeDesc MyTargetSDNodeDescs[] = {
-// CHECK-NEXT: {1, 0, 0, 0, 0, 1, 0, 0}, // NODE
-// CHECK-NEXT: };
-// CHECK-EMPTY:
-// CHECK-NEXT: static const SDNodeInfo MyTargetGenSDNodeInfo(
-// CHECK-NEXT: /*NumOpcodes=*/1, MyTargetSDNodeDescs,
-// CHECK-NEXT: MyTargetSDNodeNames, MyTargetSDTypeConstraints);
-
-
-//--- test2.td
-// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %t/test2.td | FileCheck %t/test2.td
+// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %s | FileCheck %s
include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
new file mode 100644
index 0000000..0c5c63d
--- /dev/null
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
@@ -0,0 +1,50 @@
+// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def MyTarget : Target;
+
+// CHECK: #ifdef GET_SDNODE_ENUM
+// CHECK-NEXT: #undef GET_SDNODE_ENUM
+// CHECK-EMPTY:
+// CHECK-NEXT: namespace llvm::MyTargetISD {
+// CHECK-EMPTY:
+// CHECK-NEXT: static constexpr unsigned GENERATED_OPCODE_END = ISD::BUILTIN_OP_END;
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyTargetISD
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_SDNODE_ENUM
+// CHECK-EMPTY:
+// CHECK-NEXT: #ifdef GET_SDNODE_DESC
+// CHECK-NEXT: #undef GET_SDNODE_DESC
+// CHECK-EMPTY:
+// CHECK-NEXT: namespace llvm {
+// CHECK-EMPTY:
+// CHECK-NEXT: #ifdef __GNUC__
+// CHECK-NEXT: #pragma GCC diagnostic push
+// CHECK-NEXT: #pragma GCC diagnostic ignored "-Woverlength-strings"
+// CHECK-NEXT: #endif
+// CHECK-NEXT: static constexpr char MyTargetSDNodeNamesStorage[] =
+// CHECK-NEXT: "\0"
+// CHECK-NEXT: ;
+// CHECK-NEXT: #ifdef __GNUC__
+// CHECK-NEXT: #pragma GCC diagnostic pop
+// CHECK-NEXT: #endif
+// CHECK-EMPTY:
+// CHECK-NEXT: static constexpr llvm::StringTable
+// CHECK-NEXT: MyTargetSDNodeNames = MyTargetSDNodeNamesStorage;
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDTypeConstraint MyTargetSDTypeConstraints[] = {
+// CHECK-NEXT: /* dummy */ {SDTCisVT, 0, 0, MVT::INVALID_SIMPLE_VALUE_TYPE}
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDNodeDesc MyTargetSDNodeDescs[] = {
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDNodeInfo MyTargetGenSDNodeInfo(
+// CHECK-NEXT: /*NumOpcodes=*/0, MyTargetSDNodeDescs,
+// CHECK-NEXT: MyTargetSDNodeNames, MyTargetSDTypeConstraints);
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_SDNODE_DESC
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/trivial-node.td b/llvm/test/TableGen/SDNodeInfoEmitter/trivial-node.td
new file mode 100644
index 0000000..4bdc70a
--- /dev/null
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/trivial-node.td
@@ -0,0 +1,34 @@
+// RUN: llvm-tblgen -gen-sd-node-info -I %p/../../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def MyTarget : Target;
+
+def my_noop : SDNode<"MyTargetISD::NOOP", SDTypeProfile<0, 0, []>>;
+
+// CHECK: namespace llvm::MyTargetISD {
+// CHECK-EMPTY:
+// CHECK-NEXT: enum GenNodeType : unsigned {
+// CHECK-NEXT: NOOP = ISD::BUILTIN_OP_END,
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static constexpr unsigned GENERATED_OPCODE_END = NOOP + 1;
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyTargetISD
+
+// CHECK: static constexpr char MyTargetSDNodeNamesStorage[] =
+// CHECK-NEXT: "\0"
+// CHECK-NEXT: "MyTargetISD::NOOP\0"
+// CHECK-NEXT: ;
+
+// CHECK: static const SDTypeConstraint MyTargetSDTypeConstraints[] = {
+// CHECK-NEXT: /* dummy */ {SDTCisVT, 0, 0, MVT::INVALID_SIMPLE_VALUE_TYPE}
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDNodeDesc MyTargetSDNodeDescs[] = {
+// CHECK-NEXT: {0, 0, 0, 0, 0, 1, 0, 0}, // NOOP
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static const SDNodeInfo MyTargetGenSDNodeInfo(
+// CHECK-NEXT: /*NumOpcodes=*/1, MyTargetSDNodeDescs,
+// CHECK-NEXT: MyTargetSDNodeNames, MyTargetSDTypeConstraints);
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 72d282f..c5eedb2 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -143,13 +143,14 @@ attributes #0 = { noinline optnone }
!12 = !{i64 789, i64 300}
!13 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !14, producer: "clang version 21.0.0git (git@github.com:llvm/llvm-project.git e391301e0e4d9183fe06e69602e87b0bc889aeda)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
!14 = !DIFile(filename: "basic.cc", directory: "", checksumkind: CSK_MD5, checksum: "8636c46e81402013b9d54e8307d2f149")
-!15 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !13)
+!15 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !13, declaration: !22)
!16 = !DISubroutineType(types: !17)
!17 = !{!18}
!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
!19 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
!20 = !{i32 7, !"Dwarf Version", i32 5}
!21 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
; DUMP: CCG before cloning:
; DUMP: Callsite Context Graph:
@@ -321,7 +322,8 @@ attributes #0 = { noinline optnone }
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
; IR: attributes #[[COLD]] = { "memprof"="cold" }
;; Make sure the clone's linkageName was updated.
-; IR: ![[SP]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1"
+; IR: ![[SP]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1", {{.*}} declaration: ![[SP2:[0-9]+]])
+; IR: ![[SP2]] = !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1"
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll
index dbc532e..3a68cd8 100644
--- a/llvm/test/ThinLTO/X86/memprof-icp.ll
+++ b/llvm/test/ThinLTO/X86/memprof-icp.ll
@@ -229,6 +229,7 @@
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
; RUN: -import-instr-limit=0 \
; RUN: -memprof-require-definition-for-promotion \
+; RUN: -icp-allow-decls=false \
; RUN: -enable-memprof-indirect-call-support=true \
; RUN: -supports-hot-cold-new \
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll
new file mode 100644
index 0000000..34f3924
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu -data-layout="E-n64" < %s | FileCheck %s
+
+; Pretend X86 is big endian.
+
+; FIXME: Big endian not supported yet.
+
+define void @test_i32_be(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_be(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[TMP1]] to i8
+; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ %gep.0 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.0, ptr %gep.0
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.1, ptr %gep.1
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ store i8 %x.3, ptr %p
+ ret void
+}
+
+define void @test_i32_le(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_le(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_i32_mixed_parts(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_mixed_parts(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ %gep.0 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.0, ptr %gep.0
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i16 %x.1, ptr %gep.1
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ store i8 %x.3, ptr %p
+ ret void
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
new file mode 100644
index 0000000..56786d0
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
@@ -0,0 +1,901 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+declare void @use.i16(i16)
+declare void @use.i32(i32)
+
+define void @test_i16(i16 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i16(
+; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_i8_parts(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_i8_parts(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_i32_i16_parts(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_i16_parts(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_mixed_parts(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_mixed_parts(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i16 %x.1, ptr %gep.1
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_i64(i64 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i64(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i64 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i64 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i64 %x, 8
+ %x.1 = trunc i64 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.2 = lshr i64 %x, 16
+ %x.2 = trunc i64 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i64 %x, 24
+ %x.3 = trunc i64 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ %shr.4 = lshr i64 %x, 32
+ %x.4 = trunc i64 %shr.4 to i8
+ %gep.4 = getelementptr i8, ptr %p, i64 4
+ store i8 %x.4, ptr %gep.4
+ %shr.5 = lshr i64 %x, 40
+ %x.5 = trunc i64 %shr.5 to i8
+ %gep.5 = getelementptr i8, ptr %p, i64 5
+ store i8 %x.5, ptr %gep.5
+ %shr.6 = lshr i64 %x, 48
+ %x.6 = trunc i64 %shr.6 to i8
+ %gep.6 = getelementptr i8, ptr %p, i64 6
+ store i8 %x.6, ptr %gep.6
+ %shr.7 = lshr i64 %x, 56
+ %x.7 = trunc i64 %shr.7 to i8
+ %gep.7 = getelementptr i8, ptr %p, i64 7
+ store i8 %x.7, ptr %gep.7
+ ret void
+}
+
+define void @test_i128(i128 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i128(
+; CHECK-SAME: i128 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i128 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i128 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i128 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: [[SHR_2:%.*]] = lshr i128 [[X]], 16
+; CHECK-NEXT: [[X_2:%.*]] = trunc i128 [[SHR_2]] to i8
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i128 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i128 [[SHR_3]] to i8
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: [[SHR_4:%.*]] = lshr i128 [[X]], 32
+; CHECK-NEXT: [[X_4:%.*]] = trunc i128 [[SHR_4]] to i8
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[P]], i64 4
+; CHECK-NEXT: store i8 [[X_4]], ptr [[GEP_4]], align 1
+; CHECK-NEXT: [[SHR_5:%.*]] = lshr i128 [[X]], 40
+; CHECK-NEXT: [[X_5:%.*]] = trunc i128 [[SHR_5]] to i8
+; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr i8, ptr [[P]], i64 5
+; CHECK-NEXT: store i8 [[X_5]], ptr [[GEP_5]], align 1
+; CHECK-NEXT: [[SHR_6:%.*]] = lshr i128 [[X]], 48
+; CHECK-NEXT: [[X_6:%.*]] = trunc i128 [[SHR_6]] to i8
+; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr i8, ptr [[P]], i64 6
+; CHECK-NEXT: store i8 [[X_6]], ptr [[GEP_6]], align 1
+; CHECK-NEXT: [[SHR_7:%.*]] = lshr i128 [[X]], 56
+; CHECK-NEXT: [[X_7:%.*]] = trunc i128 [[SHR_7]] to i8
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr i8, ptr [[P]], i64 7
+; CHECK-NEXT: store i8 [[X_7]], ptr [[GEP_7]], align 1
+; CHECK-NEXT: [[SHR_8:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[X_8:%.*]] = trunc i128 [[SHR_8]] to i8
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i8 [[X_8]], ptr [[GEP_8]], align 1
+; CHECK-NEXT: [[SHR_9:%.*]] = lshr i128 [[X]], 72
+; CHECK-NEXT: [[X_9:%.*]] = trunc i128 [[SHR_9]] to i8
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr i8, ptr [[P]], i64 9
+; CHECK-NEXT: store i8 [[X_9]], ptr [[GEP_9]], align 1
+; CHECK-NEXT: [[SHR_10:%.*]] = lshr i128 [[X]], 80
+; CHECK-NEXT: [[X_10:%.*]] = trunc i128 [[SHR_10]] to i8
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr i8, ptr [[P]], i64 10
+; CHECK-NEXT: store i8 [[X_10]], ptr [[GEP_10]], align 1
+; CHECK-NEXT: [[SHR_11:%.*]] = lshr i128 [[X]], 88
+; CHECK-NEXT: [[X_11:%.*]] = trunc i128 [[SHR_11]] to i8
+; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr i8, ptr [[P]], i64 11
+; CHECK-NEXT: store i8 [[X_11]], ptr [[GEP_11]], align 1
+; CHECK-NEXT: [[SHR_12:%.*]] = lshr i128 [[X]], 96
+; CHECK-NEXT: [[X_12:%.*]] = trunc i128 [[SHR_12]] to i8
+; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr i8, ptr [[P]], i64 12
+; CHECK-NEXT: store i8 [[X_12]], ptr [[GEP_12]], align 1
+; CHECK-NEXT: [[SHR_13:%.*]] = lshr i128 [[X]], 104
+; CHECK-NEXT: [[X_13:%.*]] = trunc i128 [[SHR_13]] to i8
+; CHECK-NEXT: [[GEP_13:%.*]] = getelementptr i8, ptr [[P]], i64 13
+; CHECK-NEXT: store i8 [[X_13]], ptr [[GEP_13]], align 1
+; CHECK-NEXT: [[SHR_14:%.*]] = lshr i128 [[X]], 112
+; CHECK-NEXT: [[X_14:%.*]] = trunc i128 [[SHR_14]] to i8
+; CHECK-NEXT: [[GEP_14:%.*]] = getelementptr i8, ptr [[P]], i64 14
+; CHECK-NEXT: store i8 [[X_14]], ptr [[GEP_14]], align 1
+; CHECK-NEXT: [[SHR_15:%.*]] = lshr i128 [[X]], 120
+; CHECK-NEXT: [[X_15:%.*]] = trunc i128 [[SHR_15]] to i8
+; CHECK-NEXT: [[GEP_15:%.*]] = getelementptr i8, ptr [[P]], i64 15
+; CHECK-NEXT: store i8 [[X_15]], ptr [[GEP_15]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i128 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i128 %x, 8
+ %x.1 = trunc i128 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.2 = lshr i128 %x, 16
+ %x.2 = trunc i128 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i128 %x, 24
+ %x.3 = trunc i128 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ %shr.4 = lshr i128 %x, 32
+ %x.4 = trunc i128 %shr.4 to i8
+ %gep.4 = getelementptr i8, ptr %p, i64 4
+ store i8 %x.4, ptr %gep.4
+ %shr.5 = lshr i128 %x, 40
+ %x.5 = trunc i128 %shr.5 to i8
+ %gep.5 = getelementptr i8, ptr %p, i64 5
+ store i8 %x.5, ptr %gep.5
+ %shr.6 = lshr i128 %x, 48
+ %x.6 = trunc i128 %shr.6 to i8
+ %gep.6 = getelementptr i8, ptr %p, i64 6
+ store i8 %x.6, ptr %gep.6
+ %shr.7 = lshr i128 %x, 56
+ %x.7 = trunc i128 %shr.7 to i8
+ %gep.7 = getelementptr i8, ptr %p, i64 7
+ store i8 %x.7, ptr %gep.7
+ %shr.8 = lshr i128 %x, 64
+ %x.8 = trunc i128 %shr.8 to i8
+ %gep.8 = getelementptr i8, ptr %p, i64 8
+ store i8 %x.8, ptr %gep.8
+ %shr.9 = lshr i128 %x, 72
+ %x.9 = trunc i128 %shr.9 to i8
+ %gep.9 = getelementptr i8, ptr %p, i64 9
+ store i8 %x.9, ptr %gep.9
+ %shr.10 = lshr i128 %x, 80
+ %x.10 = trunc i128 %shr.10 to i8
+ %gep.10 = getelementptr i8, ptr %p, i64 10
+ store i8 %x.10, ptr %gep.10
+ %shr.11 = lshr i128 %x, 88
+ %x.11 = trunc i128 %shr.11 to i8
+ %gep.11 = getelementptr i8, ptr %p, i64 11
+ store i8 %x.11, ptr %gep.11
+ %shr.12 = lshr i128 %x, 96
+ %x.12 = trunc i128 %shr.12 to i8
+ %gep.12 = getelementptr i8, ptr %p, i64 12
+ store i8 %x.12, ptr %gep.12
+ %shr.13 = lshr i128 %x, 104
+ %x.13 = trunc i128 %shr.13 to i8
+ %gep.13 = getelementptr i8, ptr %p, i64 13
+ store i8 %x.13, ptr %gep.13
+ %shr.14 = lshr i128 %x, 112
+ %x.14 = trunc i128 %shr.14 to i8
+ %gep.14 = getelementptr i8, ptr %p, i64 14
+ store i8 %x.14, ptr %gep.14
+ %shr.15 = lshr i128 %x, 120
+ %x.15 = trunc i128 %shr.15 to i8
+ %gep.15 = getelementptr i8, ptr %p, i64 15
+ store i8 %x.15, ptr %gep.15
+ ret void
+}
+
+define void @test_i32_lo(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_lo(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_hi(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_hi(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_0]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 16
+ %x.0 = trunc i32 %shr.0 to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 24
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_mid(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_mid(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 10
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 10
+ %x.0 = trunc i32 %shr.0 to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 18
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_shift_in_zeros(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_shift_in_zeros(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 20
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_0]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 20
+ %x.0 = trunc i32 %shr.0 to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 28
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_base_ptr_with_offset(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_base_ptr_with_offset(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 7
+; CHECK-NEXT: store i32 [[X]], ptr [[TMP1]], align 2
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ %gep.0 = getelementptr i8, ptr %p, i64 7
+ store i16 %x.0, ptr %gep.0
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 9
+ store i16 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_aliasing_store(i16 %x, ptr %p, ptr %p2) {
+; CHECK-LABEL: define void @test_aliasing_store(
+; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: store i8 0, ptr [[P2]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p
+ store i8 0, ptr %p2
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_non_aliasing_store(i16 %x, ptr noalias %p, ptr noalias %p2) {
+; CHECK-LABEL: define void @test_non_aliasing_store(
+; CHECK-SAME: i16 [[X:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) {
+; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: store i8 0, ptr [[P2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p
+ store i8 0, ptr %p2
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define i8 @test_aliasing_load(i16 %x, ptr %p, ptr %p2) {
+; CHECK-LABEL: define i8 @test_aliasing_load(
+; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret i8 [[V]]
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p
+ %v = load i8, ptr %p2
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret i8 %v
+}
+
+define i8 @test_non_aliasing_load(i16 %x, ptr noalias %p, ptr noalias %p2) {
+; CHECK-LABEL: define i8 @test_non_aliasing_load(
+; CHECK-SAME: i16 [[X:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) {
+; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1
+; CHECK-NEXT: ret i8 [[V]]
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p
+ %v = load i8, ptr %p2
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret i8 %v
+}
+
+define i8 @test_aliasing_load_partially_mergeable(i32 %x, ptr %p, ptr %p2) {
+; CHECK-LABEL: define i8 @test_aliasing_load_partially_mergeable(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: store i16 [[TMP3]], ptr [[TMP4]], align 1
+; CHECK-NEXT: ret i8 [[V]]
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %v = load i8, ptr %p2
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret i8 %v
+}
+
+declare void @may_unwind() memory(none)
+
+define void @test_unwind(i16 %x, ptr %p, ptr %p2) {
+; CHECK-LABEL: define void @test_unwind(
+; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: call void @may_unwind()
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p
+ call void @may_unwind()
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_multi_group(i16 %x, ptr %p1, i16 %y, ptr %p2) {
+; CHECK-LABEL: define void @test_multi_group(
+; CHECK-SAME: i16 [[X:%.*]], ptr [[P1:%.*]], i16 [[Y:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: store i16 [[X]], ptr [[P1]], align 1
+; CHECK-NEXT: call void @may_unwind()
+; CHECK-NEXT: store i16 [[Y]], ptr [[P2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p1
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p1, i64 1
+ store i8 %x.1, ptr %gep.1
+ call void @may_unwind()
+ %y.0 = trunc i16 %y to i8
+ store i8 %y.0, ptr %p2
+ %shr.2 = lshr i16 %y, 8
+ %y.1 = trunc i16 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p2, i64 1
+ store i8 %y.1, ptr %gep.2
+ ret void
+}
+
+define void @test_stores_out_of_order(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_stores_out_of_order(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.2, ptr %gep.2
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_gap(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_gap(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 7
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 7
+ %x.0 = trunc i32 %shr.0 to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_non_byte_sized(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_non_byte_sized(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i15
+; CHECK-NEXT: store i15 [[X_0]], ptr [[P]], align 2
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 15
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i17
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i17 [[X_1]], ptr [[GEP_1]], align 4
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i15
+ store i15 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 15
+ %x.1 = trunc i32 %shr.1 to i17
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i17 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_wrong_ptr_offset(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_wrong_ptr_offset(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 8
+ %x.0 = trunc i32 %shr.0 to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_wrong_endian(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_wrong_endian(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ %gep.0 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.0, ptr %gep.0
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.1, ptr %gep.1
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ store i8 %x.3, ptr %p
+ ret void
+}
+
+define void @test_i32_volatile(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_volatile(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[TMP1]] to i8
+; CHECK-NEXT: store volatile i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 8
+ %x.0 = trunc i32 %shr.0 to i8
+ store volatile i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_atomic(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_atomic(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 8
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8
+; CHECK-NEXT: store atomic i8 [[X_0]], ptr [[P]] monotonic, align 1
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
+; CHECK-NEXT: ret void
+;
+ %shr.0 = lshr i32 %x, 8
+ %x.0 = trunc i32 %shr.0 to i8
+ store atomic i8 %x.0, ptr %p monotonic, align 1
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ ret void
+}
+
+define void @test_i32_multiple_pointers(i32 %x, i32 %y, ptr %p, ptr %p2) {
+; CHECK-LABEL: define void @test_i32_multiple_pointers(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2
+; CHECK-NEXT: store i32 [[Y]], ptr [[P2]], align 2
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1
+
+ %y.0 = trunc i32 %y to i16
+ store i16 %y.0, ptr %p2
+ %y.shr.1 = lshr i32 %y, 16
+ %y.1 = trunc i32 %y.shr.1 to i16
+ %p2.gep.1 = getelementptr i8, ptr %p2, i64 2
+ store i16 %y.1, ptr %p2.gep.1
+ ret void
+}
+
+define void @test_i32_multiple_pointers_interleaved(i32 %x, i32 %y, ptr noalias %p, ptr noalias %p2) {
+; CHECK-LABEL: define void @test_i32_multiple_pointers_interleaved(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[X_0]], ptr [[P]], align 2
+; CHECK-NEXT: [[Y_0:%.*]] = trunc i32 [[Y]] to i16
+; CHECK-NEXT: store i16 [[Y_0]], ptr [[P2]], align 2
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2
+; CHECK-NEXT: [[Y_SHR_1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[Y_1:%.*]] = trunc i32 [[Y_SHR_1]] to i16
+; CHECK-NEXT: [[P2_GEP_1:%.*]] = getelementptr i8, ptr [[P2]], i64 2
+; CHECK-NEXT: store i16 [[Y_1]], ptr [[P2_GEP_1]], align 2
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p
+ %y.0 = trunc i32 %y to i16
+ store i16 %y.0, ptr %p2
+
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1
+ %y.shr.1 = lshr i32 %y, 16
+ %y.1 = trunc i32 %y.shr.1 to i16
+ %p2.gep.1 = getelementptr i8, ptr %p2, i64 2
+ store i16 %y.1, ptr %p2.gep.1
+ ret void
+}
+
+define void @test_i32_multi_use(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_multi_use(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2
+; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16
+; CHECK-NEXT: call void @use.i16(i16 [[X_0]])
+; CHECK-NEXT: call void @use.i16(i16 [[X_1]])
+; CHECK-NEXT: call void @use.i32(i32 [[SHR_1]])
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1
+ call void @use.i16(i16 %x.0)
+ call void @use.i16(i16 %x.1)
+ call void @use.i32(i32 %shr.1)
+ ret void
+}
+
+define void @test_i32_scoped_aa_same(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_scoped_aa_same(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2, !noalias [[META0:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p, !noalias !0
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1, !noalias !0
+ ret void
+}
+
+define void @test_i32_scoped_aa_different(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_scoped_aa_different(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2, !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p, !noalias !0
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1, !noalias !3
+ ret void
+}
+
+define void @test_i32_tbaa(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_i32_tbaa(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i16
+ store i16 %x.0, ptr %p, !tbaa !6
+ %shr.1 = lshr i32 %x, 16
+ %x.1 = trunc i32 %shr.1 to i16
+ %gep.1 = getelementptr i8, ptr %p, i64 2
+ store i16 %x.1, ptr %gep.1, !tbaa !6
+ ret void
+}
+
+define void @test_multiple_parts_with_gap1(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap1(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_multiple_parts_with_gap2(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap2(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_multiple_parts_with_gap3(i64 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap3(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16
+; CHECK-NEXT: store i16 [[TMP3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i64 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i64 %x, 8
+ %x.1 = trunc i64 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.3 = lshr i64 %x, 24
+ %x.3 = trunc i64 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ %shr.4 = lshr i64 %x, 32
+ %x.4 = trunc i64 %shr.4 to i8
+ %gep.4 = getelementptr i8, ptr %p, i64 4
+ store i8 %x.4, ptr %gep.4
+ ret void
+}
+
+define void @test_store_same_parts_twice(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_store_same_parts_twice(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.0, ptr %gep.2
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.1, ptr %gep.3
+ ret void
+}
+
+!0 = !{!1}
+!1 = !{!1, !2}
+!2 = !{!2}
+
+!3 = !{!4}
+!4 = !{!4, !5}
+!5 = !{!5}
+
+!6 = !{!7, !7, i64 0}
+!7 = !{!"short", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]]}
+; CHECK: [[META3]] = !{}
+;.
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll
index fad4acb..6719290 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll
@@ -393,26 +393,6 @@ bb:
ret i32 %i2
}
-define i32 @test_lifetime() {
-; CHECK-LABEL: define {{[^@]+}}@test_lifetime() {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I_H2S:%.*]] = alloca i8, i64 4, align 1
-; CHECK-NEXT: tail call void @no_sync_func(ptr noalias nofree captures(none) [[I_H2S]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 noundef 4, ptr noalias nofree nonnull align 4 captures(none) dereferenceable(4) [[I_H2S]])
-; CHECK-NEXT: store i32 10, ptr [[I_H2S]], align 4
-; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I_H2S]], align 4
-; CHECK-NEXT: ret i32 [[I2]]
-;
-bb:
- %i = tail call noalias ptr @malloc(i64 4)
- tail call void @no_sync_func(ptr %i)
- call void @llvm.lifetime.start.p0(i64 4, ptr %i)
- store i32 10, ptr %i, align 4
- %i2 = load i32, ptr %i, align 4
- tail call void @free(ptr %i)
- ret i32 %i2
-}
-
; TEST 11
define void @test11() {
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
index c7a9ec8..0be9434 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
@@ -340,27 +340,6 @@ bb:
ret i32 %i2
}
-define i32 @test_lifetime() {
-; CHECK-LABEL: define {{[^@]+}}@test_lifetime() {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = tail call noalias ptr @malloc(i64 noundef 4)
-; CHECK-NEXT: tail call void @no_sync_func(ptr noalias nofree captures(none) [[I]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 noundef 4, ptr noalias nofree nonnull align 4 captures(none) dereferenceable(4) [[I]])
-; CHECK-NEXT: store i32 10, ptr [[I]], align 4
-; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I]], align 4
-; CHECK-NEXT: tail call void @free(ptr noalias nonnull align 4 captures(none) dereferenceable(4) [[I]])
-; CHECK-NEXT: ret i32 [[I2]]
-;
-bb:
- %i = tail call noalias ptr @malloc(i64 4)
- tail call void @no_sync_func(ptr %i)
- call void @llvm.lifetime.start.p0(i64 4, ptr %i)
- store i32 10, ptr %i, align 4
- %i2 = load i32, ptr %i, align 4
- tail call void @free(ptr %i)
- ret i32 %i2
-}
-
; TEST 11
define void @test11() {
diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll
index 9c27fca..936b8a0 100644
--- a/llvm/test/Transforms/Attributor/memory_locations.ll
+++ b/llvm/test/Transforms/Attributor/memory_locations.ll
@@ -300,7 +300,6 @@ entry:
declare ptr @unknown_ptr() readnone
declare ptr @argmem_only(ptr %arg) argmemonly
declare ptr @inaccesible_argmem_only_decl(ptr %arg) inaccessiblemem_or_argmemonly
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) nounwind argmemonly willreturn
define void @callerA1(ptr %arg) {
; CHECK: Function Attrs: memory(argmem: readwrite)
@@ -387,21 +386,10 @@ define void @callerD2() {
ret void
}
-define void @callerE(ptr %arg) {
-; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define {{[^@]+}}@callerE
-; CHECK-SAME: (ptr nofree readnone captures(none) [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
-; CHECK-NEXT: ret void
-;
- call void @llvm.lifetime.start.p0(i64 4, ptr %arg)
- ret void
-}
-
-
define void @write_global() {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
; CHECK-LABEL: define {{[^@]+}}@write_global
-; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: store i32 0, ptr @G, align 4
; CHECK-NEXT: ret void
;
@@ -411,7 +399,7 @@ define void @write_global() {
define void @write_global_via_arg(ptr %GPtr) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
; CHECK-LABEL: define {{[^@]+}}@write_global_via_arg
-; CHECK-SAME: (ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[GPTR:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: (ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[GPTR:%.*]]) #[[ATTR5:[0-9]+]] {
; CHECK-NEXT: store i32 0, ptr [[GPTR]], align 4
; CHECK-NEXT: ret void
;
@@ -421,7 +409,7 @@ define void @write_global_via_arg(ptr %GPtr) {
define internal void @write_global_via_arg_internal(ptr %GPtr) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none)
; CHECK-LABEL: define {{[^@]+}}@write_global_via_arg_internal
-; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: store i32 0, ptr @G, align 4
; CHECK-NEXT: ret void
;
@@ -432,14 +420,14 @@ define internal void @write_global_via_arg_internal(ptr %GPtr) {
define void @writeonly_global() {
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
; TUNIT-LABEL: define {{[^@]+}}@writeonly_global
-; TUNIT-SAME: () #[[ATTR6]] {
-; TUNIT-NEXT: call void @write_global() #[[ATTR12:[0-9]+]]
+; TUNIT-SAME: () #[[ATTR4]] {
+; TUNIT-NEXT: call void @write_global() #[[ATTR10:[0-9]+]]
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write)
; CGSCC-LABEL: define {{[^@]+}}@writeonly_global
-; CGSCC-SAME: () #[[ATTR9:[0-9]+]] {
-; CGSCC-NEXT: call void @write_global() #[[ATTR13:[0-9]+]]
+; CGSCC-SAME: () #[[ATTR7:[0-9]+]] {
+; CGSCC-NEXT: call void @write_global() #[[ATTR11:[0-9]+]]
; CGSCC-NEXT: ret void
;
call void @write_global()
@@ -448,14 +436,14 @@ define void @writeonly_global() {
define void @writeonly_global_via_arg() {
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
; TUNIT-LABEL: define {{[^@]+}}@writeonly_global_via_arg
-; TUNIT-SAME: () #[[ATTR6]] {
-; TUNIT-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR12]]
+; TUNIT-SAME: () #[[ATTR4]] {
+; TUNIT-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR10]]
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write)
; CGSCC-LABEL: define {{[^@]+}}@writeonly_global_via_arg
-; CGSCC-SAME: () #[[ATTR9]] {
-; CGSCC-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR13]]
+; CGSCC-SAME: () #[[ATTR7]] {
+; CGSCC-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR11]]
; CGSCC-NEXT: ret void
;
call void @write_global_via_arg(ptr @G)
@@ -466,14 +454,14 @@ define void @writeonly_global_via_arg_internal() {
;
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
; TUNIT-LABEL: define {{[^@]+}}@writeonly_global_via_arg_internal
-; TUNIT-SAME: () #[[ATTR6]] {
-; TUNIT-NEXT: call void @write_global_via_arg_internal() #[[ATTR12]]
+; TUNIT-SAME: () #[[ATTR4]] {
+; TUNIT-NEXT: call void @write_global_via_arg_internal() #[[ATTR10]]
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write)
; CGSCC-LABEL: define {{[^@]+}}@writeonly_global_via_arg_internal
-; CGSCC-SAME: () #[[ATTR9]] {
-; CGSCC-NEXT: call void @write_global_via_arg_internal() #[[ATTR13]]
+; CGSCC-SAME: () #[[ATTR7]] {
+; CGSCC-NEXT: call void @write_global_via_arg_internal() #[[ATTR11]]
; CGSCC-NEXT: ret void
;
call void @write_global_via_arg_internal(ptr @G)
@@ -483,11 +471,11 @@ define void @writeonly_global_via_arg_internal() {
define i8 @recursive_not_readnone(ptr %ptr, i1 %c) {
; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write)
; TUNIT-LABEL: define {{[^@]+}}@recursive_not_readnone
-; TUNIT-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9:[0-9]+]] {
+; TUNIT-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7:[0-9]+]] {
; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; TUNIT: t:
-; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13:[0-9]+]]
+; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11:[0-9]+]]
; TUNIT-NEXT: ret i8 1
; TUNIT: f:
; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -495,11 +483,11 @@ define i8 @recursive_not_readnone(ptr %ptr, i1 %c) {
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write)
; CGSCC-LABEL: define {{[^@]+}}@recursive_not_readnone
-; CGSCC-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10:[0-9]+]] {
+; CGSCC-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8:[0-9]+]] {
; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; CGSCC: t:
-; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14:[0-9]+]]
+; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12:[0-9]+]]
; CGSCC-NEXT: ret i8 1
; CGSCC: f:
; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -519,11 +507,11 @@ f:
define internal i8 @recursive_not_readnone_internal(ptr %ptr, i1 %c) {
; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write)
; TUNIT-LABEL: define {{[^@]+}}@recursive_not_readnone_internal
-; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9]] {
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] {
; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; TUNIT: t:
-; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13]]
+; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11]]
; TUNIT-NEXT: ret i8 1
; TUNIT: f:
; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -531,11 +519,11 @@ define internal i8 @recursive_not_readnone_internal(ptr %ptr, i1 %c) {
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write)
; CGSCC-LABEL: define {{[^@]+}}@recursive_not_readnone_internal
-; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] {
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8]] {
; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; CGSCC: t:
-; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14]]
+; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12]]
; CGSCC-NEXT: ret i8 1
; CGSCC: f:
; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -555,16 +543,16 @@ f:
define i8 @readnone_caller(i1 %c) {
; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none)
; TUNIT-LABEL: define {{[^@]+}}@readnone_caller
-; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR10:[0-9]+]] {
+; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR8:[0-9]+]] {
; TUNIT-NEXT: [[A:%.*]] = alloca i8, align 1
-; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR13]]
+; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR11]]
; TUNIT-NEXT: ret i8 [[R]]
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(none)
; CGSCC-LABEL: define {{[^@]+}}@readnone_caller
-; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR11:[0-9]+]] {
+; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR9:[0-9]+]] {
; CGSCC-NEXT: [[A:%.*]] = alloca i8, align 1
-; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR15:[0-9]+]]
+; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR13:[0-9]+]]
; CGSCC-NEXT: ret i8 [[R]]
;
%a = alloca i8
@@ -575,11 +563,11 @@ define i8 @readnone_caller(i1 %c) {
define internal i8 @recursive_readnone_internal2(ptr %ptr, i1 %c) {
; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write)
; TUNIT-LABEL: define {{[^@]+}}@recursive_readnone_internal2
-; TUNIT-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9]] {
+; TUNIT-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] {
; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; TUNIT: t:
-; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13]]
+; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11]]
; TUNIT-NEXT: ret i8 1
; TUNIT: f:
; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -587,11 +575,11 @@ define internal i8 @recursive_readnone_internal2(ptr %ptr, i1 %c) {
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write)
; CGSCC-LABEL: define {{[^@]+}}@recursive_readnone_internal2
-; CGSCC-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] {
+; CGSCC-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8]] {
; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; CGSCC: t:
-; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14]]
+; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12]]
; CGSCC-NEXT: ret i8 1
; CGSCC: f:
; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -611,14 +599,14 @@ f:
define i8 @readnone_caller2(i1 %c) {
; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none)
; TUNIT-LABEL: define {{[^@]+}}@readnone_caller2
-; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR10]] {
-; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr undef, i1 noundef [[C]]) #[[ATTR13]]
+; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR8]] {
+; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr undef, i1 noundef [[C]]) #[[ATTR11]]
; TUNIT-NEXT: ret i8 [[R]]
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(none)
; CGSCC-LABEL: define {{[^@]+}}@readnone_caller2
-; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR11]] {
-; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr nofree undef, i1 noundef [[C]]) #[[ATTR15]]
+; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR9]] {
+; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr nofree undef, i1 noundef [[C]]) #[[ATTR13]]
; CGSCC-NEXT: ret i8 [[R]]
;
%r = call i8 @recursive_readnone_internal2(ptr undef, i1 %c)
@@ -628,11 +616,11 @@ define i8 @readnone_caller2(i1 %c) {
define internal i8 @recursive_not_readnone_internal3(ptr %ptr, i1 %c) {
; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write)
; TUNIT-LABEL: define {{[^@]+}}@recursive_not_readnone_internal3
-; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9]] {
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] {
; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; TUNIT: t:
-; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13]]
+; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11]]
; TUNIT-NEXT: ret i8 1
; TUNIT: f:
; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -640,11 +628,11 @@ define internal i8 @recursive_not_readnone_internal3(ptr %ptr, i1 %c) {
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write)
; CGSCC-LABEL: define {{[^@]+}}@recursive_not_readnone_internal3
-; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] {
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8]] {
; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; CGSCC: t:
-; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14]]
+; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12]]
; CGSCC-NEXT: ret i8 1
; CGSCC: f:
; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1
@@ -664,16 +652,16 @@ f:
define i8 @readnone_caller3(i1 %c) {
; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none)
; TUNIT-LABEL: define {{[^@]+}}@readnone_caller3
-; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR10]] {
+; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR8]] {
; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
-; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR13]]
+; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR11]]
; TUNIT-NEXT: ret i8 [[R]]
;
; CGSCC: Function Attrs: nofree nosync nounwind memory(none)
; CGSCC-LABEL: define {{[^@]+}}@readnone_caller3
-; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR11]] {
+; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR9]] {
; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1
-; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR15]]
+; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR13]]
; CGSCC-NEXT: ret i8 [[R]]
;
%alloc = alloca i8
@@ -684,7 +672,7 @@ define i8 @readnone_caller3(i1 %c) {
define internal void @argmemonly_before_ipconstprop(ptr %p) argmemonly {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none)
; CHECK-LABEL: define {{[^@]+}}@argmemonly_before_ipconstprop
-; CHECK-SAME: () #[[ATTR8]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: store i32 0, ptr @G, align 4
; CHECK-NEXT: ret void
;
@@ -695,14 +683,14 @@ define internal void @argmemonly_before_ipconstprop(ptr %p) argmemonly {
define void @argmemonly_caller() {
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
; TUNIT-LABEL: define {{[^@]+}}@argmemonly_caller
-; TUNIT-SAME: () #[[ATTR6]] {
-; TUNIT-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR12]]
+; TUNIT-SAME: () #[[ATTR4]] {
+; TUNIT-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR10]]
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write)
; CGSCC-LABEL: define {{[^@]+}}@argmemonly_caller
-; CGSCC-SAME: () #[[ATTR9]] {
-; CGSCC-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR13]]
+; CGSCC-SAME: () #[[ATTR7]] {
+; CGSCC-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR11]]
; CGSCC-NEXT: ret void
;
call void @argmemonly_before_ipconstprop(ptr @G)
@@ -714,10 +702,10 @@ declare ptr @no_mem_unknown_ptr(ptr %arg) memory(none)
define void @argmem_and_unknown(i1 %c, ptr %arg) memory(argmem: readwrite) {
; TUNIT: Function Attrs: nosync memory(argmem: write)
; TUNIT-LABEL: define {{[^@]+}}@argmem_and_unknown
-; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR11:[0-9]+]] {
+; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR9:[0-9]+]] {
; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; TUNIT: t:
-; TUNIT-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR14:[0-9]+]]
+; TUNIT-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR12:[0-9]+]]
; TUNIT-NEXT: store i32 0, ptr [[P]], align 4
; TUNIT-NEXT: br label [[F]]
; TUNIT: f:
@@ -725,10 +713,10 @@ define void @argmem_and_unknown(i1 %c, ptr %arg) memory(argmem: readwrite) {
;
; CGSCC: Function Attrs: nosync memory(argmem: write)
; CGSCC-LABEL: define {{[^@]+}}@argmem_and_unknown
-; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR12:[0-9]+]] {
+; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR10:[0-9]+]] {
; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
; CGSCC: t:
-; CGSCC-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR16:[0-9]+]]
+; CGSCC-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR14:[0-9]+]]
; CGSCC-NEXT: store i32 0, ptr [[P]], align 4
; CGSCC-NEXT: br label [[F]]
; CGSCC: f:
@@ -747,33 +735,29 @@ f:
; TUNIT: attributes #[[ATTR1]] = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
; TUNIT: attributes #[[ATTR2]] = { memory(none) }
; TUNIT: attributes #[[ATTR3]] = { memory(argmem: readwrite) }
-; TUNIT: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
-; TUNIT: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) }
-; TUNIT: attributes #[[ATTR7]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
-; TUNIT: attributes #[[ATTR8]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) }
-; TUNIT: attributes #[[ATTR9]] = { nofree nosync nounwind memory(argmem: write) }
-; TUNIT: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind memory(none) }
-; TUNIT: attributes #[[ATTR11]] = { nosync memory(argmem: write) }
-; TUNIT: attributes #[[ATTR12]] = { nofree nosync nounwind willreturn memory(write) }
-; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind memory(write) }
-; TUNIT: attributes #[[ATTR14]] = { nosync }
+; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) }
+; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
+; TUNIT: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) }
+; TUNIT: attributes #[[ATTR7]] = { nofree nosync nounwind memory(argmem: write) }
+; TUNIT: attributes #[[ATTR8]] = { nofree norecurse nosync nounwind memory(none) }
+; TUNIT: attributes #[[ATTR9]] = { nosync memory(argmem: write) }
+; TUNIT: attributes #[[ATTR10]] = { nofree nosync nounwind willreturn memory(write) }
+; TUNIT: attributes #[[ATTR11]] = { nofree nosync nounwind memory(write) }
+; TUNIT: attributes #[[ATTR12]] = { nosync }
;.
; CGSCC: attributes #[[ATTR0]] = { memory(inaccessiblemem: readwrite) }
; CGSCC: attributes #[[ATTR1]] = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
; CGSCC: attributes #[[ATTR2]] = { memory(none) }
; CGSCC: attributes #[[ATTR3]] = { memory(argmem: readwrite) }
-; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
-; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) }
-; CGSCC: attributes #[[ATTR7]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
-; CGSCC: attributes #[[ATTR8]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) }
-; CGSCC: attributes #[[ATTR9]] = { mustprogress nofree nosync nounwind willreturn memory(write) }
-; CGSCC: attributes #[[ATTR10]] = { nofree nosync nounwind memory(argmem: write) }
-; CGSCC: attributes #[[ATTR11]] = { nofree nosync nounwind memory(none) }
-; CGSCC: attributes #[[ATTR12]] = { nosync memory(argmem: write) }
-; CGSCC: attributes #[[ATTR13]] = { nofree nounwind willreturn memory(write) }
-; CGSCC: attributes #[[ATTR14]] = { nofree nosync nounwind memory(write) }
-; CGSCC: attributes #[[ATTR15]] = { nofree nounwind memory(write) }
-; CGSCC: attributes #[[ATTR16]] = { nosync }
+; CGSCC: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) }
+; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
+; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) }
+; CGSCC: attributes #[[ATTR7]] = { mustprogress nofree nosync nounwind willreturn memory(write) }
+; CGSCC: attributes #[[ATTR8]] = { nofree nosync nounwind memory(argmem: write) }
+; CGSCC: attributes #[[ATTR9]] = { nofree nosync nounwind memory(none) }
+; CGSCC: attributes #[[ATTR10]] = { nosync memory(argmem: write) }
+; CGSCC: attributes #[[ATTR11]] = { nofree nounwind willreturn memory(write) }
+; CGSCC: attributes #[[ATTR12]] = { nofree nosync nounwind memory(write) }
+; CGSCC: attributes #[[ATTR13]] = { nofree nounwind memory(write) }
+; CGSCC: attributes #[[ATTR14]] = { nosync }
;.
diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
index 005c021..54782c5 100644
--- a/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
+++ b/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll
@@ -18,11 +18,11 @@ bb:
br i1 %tmp4, label %bb6, label %bb5
bb5: ; preds = %bb
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp1) #2
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp) #2
store i32 %tmp3, ptr %tmp, align 4, !tbaa !2
store i32 %tmp3, ptr @g, align 4, !tbaa !2
call void @bar(ptr nonnull %tmp) #2
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %tmp1) #2
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %tmp) #2
br label %bb6
bb6: ; preds = %bb5, %bb
diff --git a/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll b/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll
index 03ff31b..e9d5fb6 100644
--- a/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll
+++ b/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll
@@ -9,8 +9,7 @@
define void @_Z3foov() local_unnamed_addr {
bb:
%tmp = alloca %class.A, align 1
- %tmp1 = getelementptr inbounds %class.A, ptr %tmp, i64 0, i32 0
- call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %tmp1)
+ call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %tmp)
%tmp2 = load i32, ptr @cond, align 4, !tbaa !2
%tmp3 = icmp eq i32 %tmp2, 0
br i1 %tmp3, label %bb4, label %bb5
@@ -20,7 +19,7 @@ bb4: ; preds = %bb
br label %bb5
bb5: ; preds = %bb4, %bb
- call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %tmp1)
+ call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %tmp)
ret void
}
@@ -38,7 +37,6 @@ define void @_Z3goov() local_unnamed_addr {
bb:
; CHECK: bb:
; CHECK-NOT: alloca
-; CHECK-NOT: getelementptr
; CHECK-NOT: llvm.lifetime
; CHECK: br i1
; CHECK: codeRepl.i:
@@ -50,7 +48,6 @@ bb:
; CHECK-LABEL: define internal void @_Z3foov.1.
; CHECK: newFuncRoot:
; CHECK-NEXT: %tmp = alloca %class.A
-; CHECK-NEXT: %tmp1 = getelementptr
; CHECK-NEXT: call void @llvm.lifetime.start.p0
; CHECK: call void @llvm.lifetime.end.p0
; CHECK-NEXT: br label %bb5.exitStub
diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll
index 9b5362d..6bf268b 100644
--- a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll
+++ b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll
@@ -61,10 +61,11 @@ entry:
declare i64 @llvm.aarch64.udiv.i64.i64(i64, i64)
-define void @test_free_intrinsics(i64 %x, ptr %ptr) {
+define void @test_free_intrinsics(i64 %x) {
; CHECK-LABEL: @test_free_intrinsics(
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000032, ptr [[PTR:%.*]])
+; CHECK-NEXT: [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000032, ptr [[PTR]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000064, ptr [[PTR]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100000000128, ptr [[PTR]])
; CHECK-NEXT: [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 100000000256, ptr [[PTR]])
@@ -72,6 +73,7 @@ define void @test_free_intrinsics(i64 %x, ptr %ptr) {
; CHECK-NEXT: ret void
;
entry:
+ %ptr = alloca i8
call void @llvm.lifetime.start.p0(i64 100000000032, ptr %ptr)
call void @llvm.lifetime.start.p0(i64 100000000064, ptr %ptr)
call void @llvm.lifetime.end.p0(i64 100000000128, ptr %ptr)
diff --git a/llvm/test/Transforms/DCE/basic.ll b/llvm/test/Transforms/DCE/basic.ll
index 134994a..1a3b12e 100644
--- a/llvm/test/Transforms/DCE/basic.ll
+++ b/llvm/test/Transforms/DCE/basic.ll
@@ -26,47 +26,5 @@ define i32 @test_lifetime_alloca() {
ret i32 0
}
-; CHECK-LABEL: @test_lifetime_arg
-define i32 @test_lifetime_arg(ptr) {
-; Check that lifetime intrinsics are removed along with the pointer.
-; CHECK-NEXT: #dbg_value
-; CHECK-NEXT: ret i32 0
-; CHECK-NOT: llvm.lifetime.start
-; CHECK-NOT: llvm.lifetime.end
- call void @llvm.lifetime.start.p0(i64 -1, ptr %0)
- call void @llvm.lifetime.end.p0(i64 -1, ptr %0)
- ret i32 0
-}
-
-@glob = global i8 1
-
-; CHECK-LABEL: @test_lifetime_global
-define i32 @test_lifetime_global() {
-; Check that lifetime intrinsics are removed along with the pointer.
-; CHECK-NEXT: #dbg_value
-; CHECK-NEXT: ret i32 0
-; CHECK-NOT: llvm.lifetime.start
-; CHECK-NOT: llvm.lifetime.end
- call void @llvm.lifetime.start.p0(i64 -1, ptr @glob)
- call void @llvm.lifetime.end.p0(i64 -1, ptr @glob)
- ret i32 0
-}
-
-; CHECK-LABEL: @test_lifetime_bitcast
-define i32 @test_lifetime_bitcast(ptr %arg) {
-; Check that lifetime intrinsics are NOT removed when the pointer is a bitcast.
-; It's not uncommon for two bitcasts to be made: one for lifetime, one for use.
-; TODO: Support the above case.
-; CHECK-NEXT: bitcast
-; CHECK-NEXT: #dbg_value
-; CHECK-NEXT: llvm.lifetime.start.p0(i64 -1, ptr %cast)
-; CHECK-NEXT: llvm.lifetime.end.p0(i64 -1, ptr %cast)
-; CHECK-NEXT: ret i32 0
- %cast = bitcast ptr %arg to ptr
- call void @llvm.lifetime.start.p0(i64 -1, ptr %cast)
- call void @llvm.lifetime.end.p0(i64 -1, ptr %cast)
- ret i32 0
-}
-
; CHECK: [[add]] = !DILocalVariable
; CHECK: [[sub]] = !DILocalVariable
diff --git a/llvm/test/Transforms/DeadStoreElimination/libcalls.ll b/llvm/test/Transforms/DeadStoreElimination/libcalls.ll
index 4d9a767..27ad639 100644
--- a/llvm/test/Transforms/DeadStoreElimination/libcalls.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/libcalls.ll
@@ -67,19 +67,6 @@ define void @test_strcat_with_lifetime(ptr %src) {
ret void
}
-define void @test_strcat_with_lifetime_nonlocal(ptr %dest, ptr %src) {
-; CHECK-LABEL: @test_strcat_with_lifetime_nonlocal(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[DEST:%.*]])
-; CHECK-NEXT: [[CALL:%.*]] = call ptr @strcat(ptr [[DEST]], ptr [[SRC:%.*]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[DEST]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %dest)
- %call = call ptr @strcat(ptr %dest, ptr %src)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %dest)
- ret void
-}
-
declare ptr @strncat(ptr %dest, ptr %src, i64 %n) nounwind
define void @test4(ptr %src) {
; CHECK-LABEL: @test4(
diff --git a/llvm/test/Transforms/DeadStoreElimination/lifetime.ll b/llvm/test/Transforms/DeadStoreElimination/lifetime.ll
index 73b9903..19e7b0d 100644
--- a/llvm/test/Transforms/DeadStoreElimination/lifetime.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/lifetime.ll
@@ -25,12 +25,12 @@ define void @test1() {
define void @test2(ptr %P) {
; CHECK-LABEL: @test2(
-; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 1
+; CHECK-NEXT: [[Q:%.*]] = alloca i32, align 4
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[Q]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[Q]])
; CHECK-NEXT: ret void
;
- %Q = getelementptr i32, ptr %P, i32 1
+ %Q = alloca i32
call void @llvm.lifetime.start.p0(i64 4, ptr %Q)
store i32 0, ptr %Q ;; This store is dead.
call void @llvm.lifetime.end.p0(i64 4, ptr %Q)
@@ -114,19 +114,19 @@ exit:
; lifetime.end only marks the first two bytes of %A as dead. Make sure
; `store i8 20, ptr %A.2` is not removed.
-define void @test5_lifetime_end_partial(ptr %A) {
+define void @test5_lifetime_end_partial() {
; CHECK-LABEL: @test5_lifetime_end_partial(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[A:%.*]])
+; CHECK-NEXT: [[A:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[A]])
; CHECK-NEXT: [[A_1:%.*]] = getelementptr i8, ptr [[A]], i64 1
; CHECK-NEXT: [[A_2:%.*]] = getelementptr i8, ptr [[A]], i64 2
; CHECK-NEXT: store i8 20, ptr [[A_2]], align 1
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[A]])
; CHECK-NEXT: call void @use(ptr [[A_1]])
-; CHECK-NEXT: store i8 30, ptr [[A_1]], align 1
-; CHECK-NEXT: store i8 40, ptr [[A_2]], align 1
; CHECK-NEXT: ret void
;
+ %A = alloca [4 x i8]
call void @llvm.lifetime.start.p0(i64 2, ptr %A)
%A.1 = getelementptr i8, ptr %A, i64 1
%A.2 = getelementptr i8, ptr %A, i64 2
diff --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll
index 95bd859..588bdc0 100644
--- a/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll
@@ -398,7 +398,7 @@ bb5:
@linenum = external local_unnamed_addr global i32, align 4
-define void @accessible_after_return11_loop() {
+define void @accessible_after_return11_loop(ptr noalias %p) {
; CHECK-LABEL: @accessible_after_return11_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY_I:%.*]]
@@ -406,7 +406,7 @@ define void @accessible_after_return11_loop() {
; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond()
; CHECK-NEXT: br i1 [[C_1]], label [[FOR_BODY_I]], label [[INIT_PARSE_EXIT:%.*]]
; CHECK: init_parse.exit:
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull undef)
+; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4
; CHECK-NEXT: store i32 0, ptr @linenum, align 4
; CHECK-NEXT: br label [[FOR_BODY_I20:%.*]]
; CHECK: for.body.i20:
@@ -424,7 +424,7 @@ for.body.i: ; preds = %for.body.i, %entry
init_parse.exit: ; preds = %for.body.i
store i32 0, ptr @linenum, align 4
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull undef) #2
+ store i32 1, ptr %p
store i32 0, ptr @linenum, align 4
br label %for.body.i20
@@ -435,7 +435,6 @@ for.body.i20: ; preds = %for.body.i20, %init
exit:
ret void
}
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
declare i1 @cond() readnone nounwind
; Tests where the pointer/object is *NOT* accessible after the function returns.
diff --git a/llvm/test/Transforms/EarlyCSE/memoryssa.ll b/llvm/test/Transforms/EarlyCSE/memoryssa.ll
index 942b6f8..ba4cce4 100644
--- a/llvm/test/Transforms/EarlyCSE/memoryssa.ll
+++ b/llvm/test/Transforms/EarlyCSE/memoryssa.ll
@@ -142,10 +142,12 @@ end:
;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting
;; stores that, without the lifetime calls, would be writebacks.
-define void @test_writeback_lifetimes(ptr %p) {
+define void @test_writeback_lifetimes() {
; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes(
; CHECK-NOMEMSSA-NEXT: entry:
-; CHECK-NOMEMSSA-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
+; CHECK-NOMEMSSA-NEXT: [[P:%.*]] = alloca i64, align 8
+; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]])
+; CHECK-NOMEMSSA-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P]], i64 1
; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NOMEMSSA-NEXT: [[QV:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]])
@@ -156,7 +158,9 @@ define void @test_writeback_lifetimes(ptr %p) {
;
; CHECK-LABEL: @test_writeback_lifetimes(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
+; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]])
+; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P]], i64 1
; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[QV:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]])
@@ -166,6 +170,8 @@ define void @test_writeback_lifetimes(ptr %p) {
; CHECK-NEXT: ret void
;
entry:
+ %p = alloca i64
+ call void @llvm.lifetime.start.p0(i64 8, ptr %p)
%q = getelementptr i32, ptr %p, i64 1
%pv = load i32, ptr %p
%qv = load i32, ptr %q
@@ -178,10 +184,12 @@ entry:
;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting
;; stores that, without the lifetime calls, would be writebacks.
-define void @test_writeback_lifetimes_multi_arg(ptr %p, ptr %q) {
+define void @test_writeback_lifetimes_multi_arg(ptr %q) {
; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes_multi_arg(
; CHECK-NOMEMSSA-NEXT: entry:
-; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NOMEMSSA-NEXT: [[P:%.*]] = alloca i64, align 8
+; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]])
+; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NOMEMSSA-NEXT: [[QV:%.*]] = load i32, ptr [[Q:%.*]], align 4
; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]])
; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]])
@@ -191,15 +199,18 @@ define void @test_writeback_lifetimes_multi_arg(ptr %p, ptr %q) {
;
; CHECK-LABEL: @test_writeback_lifetimes_multi_arg(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]])
+; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[QV:%.*]] = load i32, ptr [[Q:%.*]], align 4
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]])
; CHECK-NEXT: store i32 [[PV]], ptr [[P]], align 4
-; CHECK-NEXT: store i32 [[QV]], ptr [[Q]], align 4
; CHECK-NEXT: ret void
;
entry:
+ %p = alloca i64
+ call void @llvm.lifetime.start.p0(i64 8, ptr %p)
%pv = load i32, ptr %p
%qv = load i32, ptr %q
call void @llvm.lifetime.end.p0(i64 8, ptr %p)
diff --git a/llvm/test/Transforms/GVN/assume.ll b/llvm/test/Transforms/GVN/assume.ll
index 1498aa4..5d3a23b 100644
--- a/llvm/test/Transforms/GVN/assume.ll
+++ b/llvm/test/Transforms/GVN/assume.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=gvn -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt < %s -passes='gvn<memoryssa>' -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MSSA %s
declare void @llvm.assume(i1)
declare void @use(i1)
diff --git a/llvm/test/Transforms/GVN/basic.ll b/llvm/test/Transforms/GVN/basic.ll
index c1a358a..2e360aa 100644
--- a/llvm/test/Transforms/GVN/basic.ll
+++ b/llvm/test/Transforms/GVN/basic.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=gvn -S | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -S | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt < %s -passes='gvn<memoryssa>' -S | FileCheck --check-prefixes=CHECK,MSSA %s
define i32 @main() {
; CHECK-LABEL: define i32 @main() {
diff --git a/llvm/test/Transforms/GVN/lifetime-simple.ll b/llvm/test/Transforms/GVN/lifetime-simple.ll
index bf7a6ef..177f43f 100644
--- a/llvm/test/Transforms/GVN/lifetime-simple.ll
+++ b/llvm/test/Transforms/GVN/lifetime-simple.ll
@@ -1,13 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=gvn -S | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin7"
-
-define i8 @test(ptr %P) nounwind {
-; CHECK: lifetime.start
-; CHECK-NOT: load
-; CHECK: lifetime.end
+define i8 @test() nounwind {
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P:%.*]] = alloca [32 x i8], align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[P]])
+; CHECK-NEXT: store i8 1, ptr [[P]], align 1
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[P]])
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT: ret i8 [[TMP0]]
+;
entry:
+ %P = alloca [32 x i8]
call void @llvm.lifetime.start.p0(i64 32, ptr %P)
%0 = load i8, ptr %P
store i8 1, ptr %P
diff --git a/llvm/test/Transforms/GVN/nonescaping.ll b/llvm/test/Transforms/GVN/nonescaping.ll
index 2913755..0866a27 100644
--- a/llvm/test/Transforms/GVN/nonescaping.ll
+++ b/llvm/test/Transforms/GVN/nonescaping.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -passes=gvn 2>&1 | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt < %s -S -passes='gvn<memoryssa;no-memdep>' 2>&1 | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt < %s -S -passes='gvn<memoryssa>' 2>&1 | FileCheck --check-prefixes=CHECK,MSSA %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/llvm/test/Transforms/GVN/opt-remarks.ll b/llvm/test/Transforms/GVN/opt-remarks.ll
index 8fb2d57..87cd54d 100644
--- a/llvm/test/Transforms/GVN/opt-remarks.ll
+++ b/llvm/test/Transforms/GVN/opt-remarks.ll
@@ -107,7 +107,8 @@ entry:
ret i32 %add
}
-define i8 @lifetime_end(ptr %p, i8 %val) {
+define i8 @lifetime_end(i8 %val) {
+ %p = alloca [32 x i8]
call void @llvm.lifetime.start.p0(i64 32, ptr %p)
store i8 %val, ptr %p
call void @llvm.lifetime.end.p0(i64 32, ptr %p)
diff --git a/llvm/test/Transforms/GVN/phi.ll b/llvm/test/Transforms/GVN/phi.ll
index 5b607f7..a0207cf 100644
--- a/llvm/test/Transforms/GVN/phi.ll
+++ b/llvm/test/Transforms/GVN/phi.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt -S -passes='gvn<memoryssa;no-memdep>' < %s | FileCheck %s
+; RUN: opt -S -passes='gvn<memoryssa>' < %s | FileCheck %s
define i64 @test1(i1 %c, i64 %a, i64 %b) {
diff --git a/llvm/test/Transforms/GVN/pr14166.ll b/llvm/test/Transforms/GVN/pr14166.ll
index bbc8c89..6e23bdc 100644
--- a/llvm/test/Transforms/GVN/pr14166.ll
+++ b/llvm/test/Transforms/GVN/pr14166.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -disable-basic-aa -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt -disable-basic-aa -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt -disable-basic-aa -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s
target datalayout = "e-p:32:32:32"
define <2 x i32> @test1() {
; MDEP-LABEL: define <2 x i32> @test1() {
diff --git a/llvm/test/Transforms/GVN/pre-compare.ll b/llvm/test/Transforms/GVN/pre-compare.ll
index 574d40d..c4f083b 100644
--- a/llvm/test/Transforms/GVN/pre-compare.ll
+++ b/llvm/test/Transforms/GVN/pre-compare.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s
; C source:
;
diff --git a/llvm/test/Transforms/GVN/readattrs.ll b/llvm/test/Transforms/GVN/readattrs.ll
index be018834..6e02dd3 100644
--- a/llvm/test/Transforms/GVN/readattrs.ll
+++ b/llvm/test/Transforms/GVN/readattrs.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=gvn -S -o - < %s | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt -passes='gvn<memoryssa>' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/GVN/setjmp.ll b/llvm/test/Transforms/GVN/setjmp.ll
index 7777038..53518784 100644
--- a/llvm/test/Transforms/GVN/setjmp.ll
+++ b/llvm/test/Transforms/GVN/setjmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt -S -passes='gvn<memoryssa;no-memdep>' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt -S -passes='gvn<memoryssa>' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s
declare i32 @setjmp() returns_twice
declare void @longjmp()
declare ptr @malloc(i64)
diff --git a/llvm/test/Transforms/GVN/tbaa.ll b/llvm/test/Transforms/GVN/tbaa.ll
index 366dfec..59ace14 100644
--- a/llvm/test/Transforms/GVN/tbaa.ll
+++ b/llvm/test/Transforms/GVN/tbaa.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s
define i32 @test1(ptr %p, ptr %q) {
; MDEP-LABEL: define i32 @test1(
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index 646a67d..5d6c559 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S < %s -passes=gvn,dce | FileCheck --check-prefixes=CHECK,MDEP %s
-; RUN: opt -S < %s -passes='gvn<memoryssa;no-memdep>',dce | FileCheck --check-prefixes=CHECK,MSSA %s
+; RUN: opt -S < %s -passes='gvn<memoryssa>',dce | FileCheck --check-prefixes=CHECK,MSSA %s
; Analyze Load from clobbering Load.
diff --git a/llvm/test/Transforms/GVNSink/lifetime.ll b/llvm/test/Transforms/GVNSink/lifetime.ll
new file mode 100644
index 0000000..1a8a69b
--- /dev/null
+++ b/llvm/test/Transforms/GVNSink/lifetime.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=gvn-sink < %s | FileCheck %s
+
+; Make sure we do not sink lifetime markers if doing so would introduce a
+; lifetime marker with a non-alloca operand.
+
+define void @test_cant_sink(i1 %c) {
+; CHECK-LABEL: define void @test_cant_sink(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[B:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[B]])
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: store i64 1, ptr [[A]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: br label %[[JOIN:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: store i64 1, ptr [[B]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[B]])
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8
+ %b = alloca i8
+ call void @llvm.lifetime.start(i64 1, ptr %a)
+ call void @llvm.lifetime.start(i64 1, ptr %b)
+ br i1 %c, label %if, label %else
+
+if:
+ store i64 1, ptr %a
+ call void @llvm.lifetime.end(i64 1, ptr %a)
+ br label %join
+
+else:
+ store i64 1, ptr %b
+ call void @llvm.lifetime.end(i64 1, ptr %b)
+ br label %join
+
+join:
+ ret void
+}
+
+define void @test_can_sink(i1 %c) {
+; CHECK-LABEL: define void @test_can_sink(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[JOIN:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: store i64 1, ptr [[A]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8
+ call void @llvm.lifetime.start(i64 1, ptr %a)
+ br i1 %c, label %if, label %else
+
+if:
+ store i64 1, ptr %a
+ call void @llvm.lifetime.end(i64 1, ptr %a)
+ br label %join
+
+else:
+ store i64 1, ptr %a
+ call void @llvm.lifetime.end(i64 1, ptr %a)
+ br label %join
+
+join:
+ ret void
+}
diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll
new file mode 100644
index 0000000..258bcfb
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll
@@ -0,0 +1,15 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: The first element in the Indirection Table must be an integer; %struct.anon.1 = type { ptr, ptr } is incorrect.
+%struct.anon.1 = type { ptr, ptr }
+%class.anon = type { %struct.anon.1, ptr, %struct.anon.1 }
+@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8
+@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8
+
+define amdgpu_kernel void @store(ptr %p) {
+entry:
+ store ptr %p, ptr addrspace(1) @a, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll
new file mode 100644
index 0000000..331f4bf9
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll
@@ -0,0 +1,15 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: The second element in the Indirection Table must be a pointer; %struct.anon.1 = type { ptr, ptr } is incorrect.
+%struct.anon.1 = type { ptr, ptr }
+%class.anon = type { i64, %struct.anon.1, %struct.anon.1 }
+@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8
+@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8
+
+define amdgpu_kernel void @store(ptr %p) {
+entry:
+ store ptr %p, ptr addrspace(1) @a, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll
new file mode 100644
index 0000000..6bdedcb
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll
@@ -0,0 +1,15 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: The third element in the Indirection Table must be a struct type; i64 is incorrect.
+%struct.anon.1 = type { ptr, ptr }
+%class.anon = type { i64, ptr, i64 }
+@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8
+@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8
+
+define amdgpu_kernel void @store(ptr %p) {
+entry:
+ store ptr %p, ptr addrspace(1) @a, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll
new file mode 100644
index 0000000..cf0efa0
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll
@@ -0,0 +1,14 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: The Indirection Table must have 3 elements; 2 is incorrect.
+%class.anon = type { i64, ptr }
+@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8
+@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8
+
+define amdgpu_kernel void @store(ptr %p) {
+entry:
+ store ptr %p, ptr addrspace(1) @a, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll
new file mode 100644
index 0000000..f32e378
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll
@@ -0,0 +1,13 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: The Indirection Table must be a struct type; ptr is incorrect.
+@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8
+@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant ptr zeroinitializer, align 8
+
+define amdgpu_kernel void @store(ptr %p) {
+entry:
+ store ptr %p, ptr addrspace(1) @a, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection.ll
new file mode 100644
index 0000000..98cace6
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var-indirection.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --check-globals all --version 5
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s | FileCheck %s
+
+%class.anon = type { i64, ptr, %struct.anon.1 }
+%struct.anon.1 = type { ptr, ptr }
+%struct.A = type { i32, i32, i32, i32, i32, double, [205 x double], [2000 x i32], [52000 x i32], [156000 x double], [14823 x double] }
+
+@do_not_indirect = protected addrspace(4) externally_initialized constant [4 x double] [double 1.000000e+00, double 1.000000e+00, double 2.000000e+00, double 6.000000e+00], align 16
+@a = external hidden local_unnamed_addr addrspace(1) global %struct.A, align 8
+@b = external hidden local_unnamed_addr addrspace(1) global ptr, align 8
+@c = internal addrspace(1) global { i32 } zeroinitializer, align 4
+@d = external hidden local_unnamed_addr addrspace(1) global ptr addrspace(1), align 8
+@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8
+
+declare i64 @fn(i64 %x, i32 %y, i64 %z, i64 %w)
+
+;.
+; CHECK: @do_not_indirect = protected addrspace(4) externally_initialized constant [4 x double] [double 1.000000e+00, double 1.000000e+00, double 2.000000e+00, double 6.000000e+00], align 16
+; CHECK: @[[GLOB0:[0-9]+]] = private addrspace(1) constant [2 x i8] c"a\00"
+; CHECK: @[[GLOB1:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison
+; CHECK: @[[GLOB2:[0-9]+]] = private addrspace(1) constant [2 x i8] c"b\00"
+; CHECK: @[[GLOB3:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison
+; CHECK: @[[GLOB4:[0-9]+]] = private addrspace(1) constant [2 x i8] c"c\00"
+; CHECK: @[[GLOB5:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison
+; CHECK: @[[GLOB6:[0-9]+]] = private addrspace(1) constant [2 x i8] c"d\00"
+; CHECK: @[[GLOB7:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison
+; CHECK: @[[GLOB8:[0-9]+]] = private addrspace(1) constant [4 x %struct.anon.1] [%struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB0]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr) }, %struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr) }, %struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB5]] to ptr) }, %struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB6]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB7]] to ptr) }]
+; CHECK: @__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon { i64 4, ptr addrspacecast (ptr addrspace(1) @[[GLOB8]] to ptr), %struct.anon.1 poison }, align 8
+;.
+define double @gep(i64 %idx) {
+; CHECK-LABEL: define double @gep(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB1]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[TMP0]], i64 217672
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [156000 x double], ptr addrspace(1) [[TMP1]], i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[R:%.*]] = load double, ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: ret double [[R]]
+;
+entry:
+ %arrayidx = getelementptr inbounds [156000 x double], ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @a, i64 217672), i64 0, i64 %idx
+ %r = load double, ptr addrspace(1) %arrayidx, align 8
+ ret double %r
+}
+
+define void @store(ptr %p) {
+; CHECK-LABEL: define void @store(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB3]], align 8
+; CHECK-NEXT: store ptr [[P]], ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ store ptr %p, ptr addrspace(1) @b, align 8
+ ret void
+}
+
+define i64 @chain(i64 %x, i32 %y, i64 %z) {
+; CHECK-LABEL: define i64 @chain(
+; CHECK-SAME: i64 [[X:%.*]], i32 [[Y:%.*]], i64 [[Z:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB5]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @fn(i64 [[X]], i32 [[Y]], i64 [[TMP2]], i64 [[Z]])
+; CHECK-NEXT: ret i64 [[TMP3]]
+;
+entry:
+ %0 = call i64 @fn(i64 %x, i32 %y, i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @c to ptr) to i64), i64 %z)
+ ret i64 %0
+}
+
+define void @direct(ptr %p, i64 %n) {
+; CHECK-LABEL: define void @direct(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB7]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: tail call void @llvm.memcpy.p0.p1.i64(ptr align 4 [[P]], ptr addrspace(1) align 4 [[TMP1]], i64 [[N]], i1 false)
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load ptr addrspace(1), ptr addrspace(1) @d, align 8
+ tail call void @llvm.memcpy.p0.p1.i64(ptr align 4 %p, ptr addrspace(1) align 4 %0, i64 %n, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @ensure_reachable(ptr %p, i64 %idx, i64 %x, i32 %y, i64 %z) {
+; CHECK-LABEL: define amdgpu_kernel void @ensure_reachable(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]], i64 [[X:%.*]], i32 [[Y:%.*]], i64 [[Z:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @store(ptr [[P]])
+; CHECK-NEXT: [[TMP0:%.*]] = call double @gep(i64 [[IDX]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @chain(i64 [[X]], i32 [[Y]], i64 [[Z]])
+; CHECK-NEXT: call void @direct(ptr [[P]], i64 [[X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @store(ptr %p)
+ %0 = call double @gep(i64 %idx)
+ %1 = call i64 @chain(i64 %x, i32 %y, i64 %z)
+ call void @direct(ptr %p, i64 %x)
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+;.
diff --git a/llvm/test/Transforms/HipStdPar/global-var.ll b/llvm/test/Transforms/HipStdPar/global-var.ll
index 860c30e..3a22a7b 100644
--- a/llvm/test/Transforms/HipStdPar/global-var.ll
+++ b/llvm/test/Transforms/HipStdPar/global-var.ll
@@ -2,8 +2,8 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
; RUN: %s | FileCheck %s
-; CHECK: @var = extern_weak addrspace(1) externally_initialized global i32, align 4
-@var = addrspace(1) global i32 0, align 4
+; CHECK: @var = addrspace(1) global i32 poison, align 4
+@var = external addrspace(1) global i32, align 4
define amdgpu_kernel void @kernel() {
entry:
diff --git a/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll b/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll
index e4e68ae..e5bab0c 100644
--- a/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll
+++ b/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll
@@ -36,11 +36,10 @@ outlinedPath:
; These two uses of stack slots overlap. This should prevent
; merging of stack slots. CodeExtractor must replicate the effects of
; these markers in the caller to inhibit stack coloring.
- %gep1 = getelementptr inbounds i8, ptr %local1, i64 1
- call void @llvm.lifetime.start.p0(i64 1, ptr %gep1)
+ call void @llvm.lifetime.start.p0(i64 1, ptr %local1)
call void @llvm.lifetime.start.p0(i64 1, ptr %local2)
call void @cold_use2(ptr %local1, ptr %local2)
- call void @llvm.lifetime.end.p0(i64 1, ptr %gep1)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %local1)
call void @llvm.lifetime.end.p0(i64 1, ptr %local2)
br i1 undef, label %outlinedPath2, label %outlinedPathExit
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll
index d39a0b3..053d073 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s
define i32 @lifetime_flat_pointer() {
@@ -5,18 +6,15 @@ define i32 @lifetime_flat_pointer() {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[ALLOCA]])
; CHECK-NEXT: store i32 1, ptr addrspace(5) [[ALLOCA]], align 4
-; CHECK-NEXT: %ret = load i32, ptr addrspace(5) [[ALLOCA]], align 4
+; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(5) [[ALLOCA]], align 4
; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[ALLOCA]])
-; CHECK-NEXT: ret i32 %ret
+; CHECK-NEXT: ret i32 [[RET]]
;
%alloca = alloca i32, align 4, addrspace(5)
%flat = addrspacecast ptr addrspace(5) %alloca to ptr
- call void @llvm.lifetime.start.p0(i64 4 , ptr %flat)
+ call void @llvm.lifetime.start(i64 4, ptr addrspace(5) %alloca)
store i32 1, ptr %flat, align 4
%ret = load i32, ptr %flat, align 4
- call void @llvm.lifetime.end.p0(i64 4 , ptr %flat)
+ call void @llvm.lifetime.end(i64 4, ptr addrspace(5) %alloca)
ret i32 %ret
}
-
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll
index 8bf6312..31e914a 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
@@ -6,20 +7,18 @@ define i32 @lifetime_flat_pointer() {
; CHECK-LABEL: define i32 @lifetime_flat_pointer() {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[ALLOCA]])
; CHECK-NEXT: store i32 1, ptr addrspace(5) [[TMP1]], align 4
-; CHECK-NEXT: %ret = load i32, ptr addrspace(5) [[TMP1]], align 4
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
-; CHECK-NEXT: ret i32 %ret
+; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[ALLOCA]])
+; CHECK-NEXT: ret i32 [[RET]]
;
%alloca = alloca i32, align 4
%1 = addrspacecast ptr %alloca to ptr addrspace(5)
- %2 = addrspacecast ptr addrspace(5) %1 to ptr
- %3 = addrspacecast ptr addrspace(5) %1 to ptr
- call void @llvm.lifetime.start.p0(i64 4, ptr %2)
+ call void @llvm.lifetime.start.p0(i64 4, ptr %alloca)
store i32 1, ptr addrspace(5) %1, align 4
%ret = load i32, ptr addrspace(5) %1, align 4
- call void @llvm.lifetime.end.p0(i64 4, ptr %3)
+ call void @llvm.lifetime.end.p0(i64 4, ptr %alloca)
ret i32 %ret
}
diff --git a/llvm/test/Transforms/Inline/alloca-bonus.ll b/llvm/test/Transforms/Inline/alloca-bonus.ll
index 1dec660..45ff527 100644
--- a/llvm/test/Transforms/Inline/alloca-bonus.ll
+++ b/llvm/test/Transforms/Inline/alloca-bonus.ll
@@ -3,8 +3,6 @@
target datalayout = "p:32:32"
-declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr)
-
@glbl = external global i32
define void @outer1() {
@@ -20,7 +18,6 @@ define void @inner1(ptr %ptr) {
store i32 0, ptr %ptr
%D = getelementptr inbounds i32, ptr %ptr, i32 1
%F = select i1 false, ptr %ptr, ptr @glbl
- call void @llvm.lifetime.start.p0(i64 0, ptr %ptr)
call void @extern()
ret void
}
@@ -39,7 +36,6 @@ define void @inner2(ptr %ptr) {
store i32 0, ptr %ptr
%D = getelementptr inbounds i32, ptr %ptr, i32 %A
%F = select i1 false, ptr %ptr, ptr @glbl
- call void @llvm.lifetime.start.p0(i64 0, ptr %ptr)
call void @extern()
ret void
}
@@ -146,7 +142,6 @@ define void @inner5(i1 %flag, ptr %ptr) {
if.then:
%D = getelementptr inbounds i32, ptr %ptr, i32 %A
%F = select i1 false, ptr %ptr, ptr @glbl
- call void @llvm.lifetime.start.p0(i64 0, ptr %ptr)
ret void
exit:
diff --git a/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll b/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll
index 12a328d..4e13ff4 100644
--- a/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll
+++ b/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll
@@ -1,7 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --force-update
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
; RUN: opt < %s -S -passes="inline" | FileCheck %s
define void @callee(i32 %a, i32 %b) #0 {
+; CHECK: Function Attrs: mustprogress
+; CHECK-LABEL: define void @callee(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: br label %[[WHILE_BODY]]
+;
entry:
br label %for.cond
for.cond:
@@ -17,20 +32,20 @@ while.body:
define void @caller(i32 %a, i32 %b) #1 {
; CHECK: Function Attrs: noinline
-; CHECK-LABEL: define {{[^@]+}}@caller
-; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1:#.*]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
+; CHECK-LABEL: define void @caller(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end:
-; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
-; CHECK: for.cond.i:
- ; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: callee.exit:
+; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_EXIT:.*]], label %[[FOR_END:.*]]
+; CHECK: [[CALLEE_EXIT]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP0]]
+; CHECK: [[CALLEE_EXIT1:.*:]]
; CHECK-NEXT: ret void
;
entry:
@@ -46,6 +61,20 @@ for.end:
}
define void @callee_no_metadata(i32 %a, i32 %b) {
+; CHECK-LABEL: define void @callee_no_metadata(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: br label %[[WHILE_BODY]]
+;
entry:
br label %for.cond
for.cond:
@@ -60,20 +89,20 @@ while.body:
}
define void @caller_no_metadata(i32 %a, i32 %b) {
-; CHECK-LABEL: define {{[^@]+}}@caller_no_metadata
-; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
+; CHECK-LABEL: define void @caller_no_metadata(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end:
-; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
-; CHECK: for.cond.i:
-; CHECK-NEXT: br label [[FOR_COND_I]]
-; CHECK: callee_no_metadata.exit:
+; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_NO_METADATA_EXIT:.*]], label %[[FOR_END:.*]]
+; CHECK: [[CALLEE_NO_METADATA_EXIT]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: br label %[[FOR_COND_I]]
+; CHECK: [[CALLEE_NO_METADATA_EXIT1:.*:]]
; CHECK-NEXT: ret void
;
entry:
@@ -89,6 +118,21 @@ for.end:
}
define void @callee_mustprogress(i32 %a, i32 %b) #0 {
+; CHECK: Function Attrs: mustprogress
+; CHECK-LABEL: define void @callee_mustprogress(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: br label %[[WHILE_BODY]]
+;
entry:
br label %for.cond
for.cond:
@@ -104,20 +148,20 @@ while.body:
define void @caller_mustprogress(i32 %a, i32 %b) #0 {
; CHECK: Function Attrs: mustprogress
-; CHECK-LABEL: define {{[^@]+}}@caller_mustprogress
-; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR0:#[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
+; CHECK-LABEL: define void @caller_mustprogress(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end:
-; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
-; CHECK: for.cond.i:
-; CHECK-NEXT: br label [[FOR_COND_I]]
-; CHECK: callee_mustprogress.exit:
+; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_MUSTPROGRESS_EXIT:.*]], label %[[FOR_END:.*]]
+; CHECK: [[CALLEE_MUSTPROGRESS_EXIT]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: br label %[[FOR_COND_I]]
+; CHECK: [[CALLEE_MUSTPROGRESS_EXIT1:.*:]]
; CHECK-NEXT: ret void
;
entry:
@@ -133,20 +177,20 @@ for.end:
}
define void @caller_mustprogress_callee_no_metadata(i32 %a, i32 %b) #0 {
-; CHECK-LABEL: define {{[^@]+}}@caller_mustprogress_callee_no_metadata
-; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
+; CHECK-LABEL: define void @caller_mustprogress_callee_no_metadata(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end:
-; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
-; CHECK: for.cond.i:
-; CHECK-NEXT: br label [[FOR_COND_I]]
-; CHECK: callee_no_metadata.exit:
+; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_NO_METADATA_EXIT:.*]], label %[[FOR_END:.*]]
+; CHECK: [[CALLEE_NO_METADATA_EXIT]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: br label %[[FOR_COND_I]]
+; CHECK: [[CALLEE_NO_METADATA_EXIT1:.*:]]
; CHECK-NEXT: ret void
;
entry:
@@ -162,6 +206,42 @@ for.end:
}
define void @callee_multiple(i32 %a, i32 %b) #0 {
+; CHECK: Function Attrs: mustprogress
+; CHECK-LABEL: define void @callee_multiple(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: store i32 0, ptr [[I]], align 4
+; CHECK-NEXT: br label %[[FOR_COND1:.*]]
+; CHECK: [[FOR_COND1]]:
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10
+; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END4:.*]]
+; CHECK: [[FOR_BODY3]]:
+; CHECK-NEXT: br label %[[FOR_INC:.*]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+; CHECK-NEXT: br label %[[FOR_COND1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END4]]:
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: br label %[[WHILE_BODY]]
+;
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
@@ -198,9 +278,9 @@ while.body:
define void @caller_multiple(i32 %a, i32 %b) #1 {
; CHECK: Function Attrs: noinline
-; CHECK-LABEL: define {{[^@]+}}@caller_multiple
-; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: entry:
+; CHECK-LABEL: define void @caller_multiple(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4
@@ -209,59 +289,59 @@ define void @caller_multiple(i32 %a, i32 %b) #1 {
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end:
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END]]:
; CHECK-NEXT: store i32 0, ptr [[I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1:%.*]]
-; CHECK: for.cond1:
+; CHECK-NEXT: br label %[[FOR_COND1:.*]]
+; CHECK: [[FOR_COND1]]:
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3:%.*]], label [[FOR_END4:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.inc:
+; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END4:.*]]
+; CHECK: [[FOR_BODY3]]:
+; CHECK-NEXT: br label %[[FOR_INC:.*]]
+; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1
; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1]]
-; CHECK: for.end4:
+; CHECK-NEXT: br label %[[FOR_COND1]]
+; CHECK: [[FOR_END4]]:
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[A_ADDR_I]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[B_ADDR_I]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I_I]])
; CHECK-NEXT: store i32 0, ptr [[A_ADDR_I]], align 4
; CHECK-NEXT: store i32 5, ptr [[B_ADDR_I]], align 4
-; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
-; CHECK: for.cond.i:
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP7]], [[TMP8]]
-; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[FOR_END_I:%.*]]
-; CHECK: for.body.i:
- ; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end.i:
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_BODY_I:.*]], label %[[FOR_END_I:.*]]
+; CHECK: [[FOR_BODY_I]]:
+; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP2]]
+; CHECK: [[FOR_END_I]]:
; CHECK-NEXT: store i32 0, ptr [[I_I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1_I:%.*]]
-; CHECK: for.cond1.i:
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I_I]], align 4
-; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP9]], 10
-; CHECK-NEXT: br i1 [[CMP2_I]], label [[FOR_BODY3_I:%.*]], label [[FOR_END4_I:%.*]]
-; CHECK: for.body3.i:
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I_I]], align 4
-; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP10]], 1
+; CHECK-NEXT: br label %[[FOR_COND1_I:.*]]
+; CHECK: [[FOR_COND1_I]]:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[I_I]], align 4
+; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP6]], 10
+; CHECK-NEXT: br i1 [[CMP2_I]], label %[[FOR_BODY3_I:.*]], label %[[CALLEE_MULTIPLE_EXIT:.*]]
+; CHECK: [[FOR_BODY3_I]]:
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I_I]], align 4
+; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP7]], 1
; CHECK-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1_I]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.end4.i:
-; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
-; CHECK: while.body.i:
-; CHECK-NEXT: br label [[WHILE_BODY_I]]
-; CHECK: callee_multiple.exit:
+; CHECK-NEXT: br label %[[FOR_COND1_I]], !llvm.loop [[LOOP3]]
+; CHECK: [[CALLEE_MULTIPLE_EXIT]]:
+; CHECK-NEXT: br label %[[WHILE_BODY_I:.*]]
+; CHECK: [[WHILE_BODY_I]]:
+; CHECK-NEXT: br label %[[WHILE_BODY_I]]
+; CHECK: [[CALLEE_MULTIPLE_EXIT1:.*:]]
; CHECK-NEXT: ret void
;
entry:
@@ -298,6 +378,51 @@ for.end4:
}
define void @callee_nested(i32 %a, i32 %b) #0 {
+; CHECK: Function Attrs: mustprogress
+; CHECK-LABEL: define void @callee_nested(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP0]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: store i32 0, ptr [[I]], align 4
+; CHECK-NEXT: br label %[[FOR_COND1:.*]]
+; CHECK: [[FOR_COND1]]:
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10
+; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END8:.*]]
+; CHECK: [[FOR_BODY3]]:
+; CHECK-NEXT: br label %[[FOR_COND4:.*]]
+; CHECK: [[FOR_COND4]]:
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: br i1 [[CMP5]], label %[[FOR_BODY6:.*]], label %[[FOR_END7:.*]]
+; CHECK: [[FOR_BODY6]]:
+; CHECK-NEXT: br label %[[FOR_COND4]], !llvm.loop [[LOOP2]]
+; CHECK: [[FOR_END7]]:
+; CHECK-NEXT: br label %[[FOR_INC:.*]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1
+; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+; CHECK-NEXT: br label %[[FOR_COND1]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[FOR_END8]]:
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: br label %[[WHILE_BODY]]
+;
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
@@ -343,9 +468,9 @@ while.body:
define void @caller_nested(i32 %a, i32 %b) #1 {
; CHECK: Function Attrs: noinline
-; CHECK-LABEL: define {{[^@]+}}@caller_nested
-; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: entry:
+; CHECK-LABEL: define void @caller_nested(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4
@@ -355,91 +480,91 @@ define void @caller_nested(i32 %a, i32 %b) #1 {
; CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END8:%.*]]
-; CHECK: for.body:
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END8:.*]]
+; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: store i32 0, ptr [[I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1:%.*]]
-; CHECK: for.cond1:
+; CHECK-NEXT: br label %[[FOR_COND1:.*]]
+; CHECK: [[FOR_COND1]]:
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3:%.*]], label [[FOR_END7:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: br label [[FOR_COND4:%.*]]
-; CHECK: for.cond4:
+; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END7:.*]]
+; CHECK: [[FOR_BODY3]]:
+; CHECK-NEXT: br label %[[FOR_COND4:.*]]
+; CHECK: [[FOR_COND4]]:
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY6:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body6:
-; CHECK-NEXT: br label [[FOR_COND4]]
-; CHECK: for.end:
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.inc:
+; CHECK-NEXT: br i1 [[CMP5]], label %[[FOR_BODY6:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY6]]:
+; CHECK-NEXT: br label %[[FOR_COND4]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: br label %[[FOR_INC:.*]]
+; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1
; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1]]
-; CHECK: for.end7:
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end8:
+; CHECK-NEXT: br label %[[FOR_COND1]]
+; CHECK: [[FOR_END7]]:
+; CHECK-NEXT: br label %[[FOR_COND]]
+; CHECK: [[FOR_END8]]:
; CHECK-NEXT: store i32 0, ptr [[I9]], align 4
-; CHECK-NEXT: br label [[FOR_COND10:%.*]]
-; CHECK: for.cond10:
+; CHECK-NEXT: br label %[[FOR_COND10:.*]]
+; CHECK: [[FOR_COND10]]:
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[I9]], align 4
; CHECK-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP6]], 10
-; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY12:%.*]], label [[FOR_END15:%.*]]
-; CHECK: for.body12:
-; CHECK-NEXT: br label [[FOR_INC13:%.*]]
-; CHECK: for.inc13:
+; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY12:.*]], label %[[FOR_END15:.*]]
+; CHECK: [[FOR_BODY12]]:
+; CHECK-NEXT: br label %[[FOR_INC13:.*]]
+; CHECK: [[FOR_INC13]]:
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I9]], align 4
; CHECK-NEXT: [[INC14:%.*]] = add nsw i32 [[TMP7]], 1
; CHECK-NEXT: store i32 [[INC14]], ptr [[I9]], align 4
-; CHECK-NEXT: br label [[FOR_COND10]]
-; CHECK: for.end15:
+; CHECK-NEXT: br label %[[FOR_COND10]]
+; CHECK: [[FOR_END15]]:
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[A_ADDR_I]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[B_ADDR_I]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I_I]])
; CHECK-NEXT: store i32 0, ptr [[A_ADDR_I]], align 4
; CHECK-NEXT: store i32 5, ptr [[B_ADDR_I]], align 4
-; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
-; CHECK: for.cond.i:
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[FOR_END_I:%.*]]
-; CHECK: for.body.i:
-; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP0]]
-; CHECK: for.end.i:
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_BODY_I:.*]], label %[[FOR_END_I:.*]]
+; CHECK: [[FOR_BODY_I]]:
+; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP0]]
+; CHECK: [[FOR_END_I]]:
; CHECK-NEXT: store i32 0, ptr [[I_I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1_I:%.*]]
-; CHECK: for.cond1.i:
+; CHECK-NEXT: br label %[[FOR_COND1_I:.*]]
+; CHECK: [[FOR_COND1_I]]:
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I_I]], align 4
+; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP10]], 10
+; CHECK-NEXT: br i1 [[CMP2_I]], label %[[FOR_BODY3_I:.*]], label %[[CALLEE_NESTED_EXIT:.*]]
+; CHECK: [[FOR_BODY3_I]]:
+; CHECK-NEXT: br label %[[FOR_COND4_I:.*]]
+; CHECK: [[FOR_COND4_I]]:
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
+; CHECK-NEXT: [[CMP5_I:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: br i1 [[CMP5_I]], label %[[FOR_BODY6_I:.*]], label %[[FOR_END7_I:.*]]
+; CHECK: [[FOR_BODY6_I]]:
+; CHECK-NEXT: br label %[[FOR_COND4_I]], !llvm.loop [[LOOP2]]
+; CHECK: [[FOR_END7_I]]:
; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4
-; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP13]], 10
-; CHECK-NEXT: br i1 [[CMP2_I]], label [[FOR_BODY3_I:%.*]], label [[FOR_END8_I:%.*]]
-; CHECK: for.body3.i:
-; CHECK-NEXT: br label [[FOR_COND4_I:%.*]]
-; CHECK: for.cond4.i:
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[B_ADDR_I]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A_ADDR_I]], align 4
-; CHECK-NEXT: [[CMP5_I:%.*]] = icmp slt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT: br i1 [[CMP5_I]], label [[FOR_BODY6_I:%.*]], label [[FOR_END7_I:%.*]]
-; CHECK: for.body6.i:
-; CHECK-NEXT: br label [[FOR_COND4_I]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end7.i:
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[I_I]], align 4
-; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP16]], 1
+; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4
-; CHECK-NEXT: br label [[FOR_COND1_I]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: for.end8.i:
-; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
-; CHECK: while.body.i:
-; CHECK-NEXT: br label [[WHILE_BODY_I]]
-; CHECK: callee_nested.exit:
+; CHECK-NEXT: br label %[[FOR_COND1_I]], !llvm.loop [[LOOP4]]
+; CHECK: [[CALLEE_NESTED_EXIT]]:
+; CHECK-NEXT: br label %[[WHILE_BODY_I:.*]]
+; CHECK: [[WHILE_BODY_I]]:
+; CHECK-NEXT: br label %[[WHILE_BODY_I]]
+; CHECK: [[CALLEE_NESTED_EXIT1:.*:]]
; CHECK-NEXT: ret void
;
entry:
@@ -499,14 +624,7 @@ for.end15:
ret void
}
-; CHECK: attributes [[ATTR0]] = { mustprogress }
-; CHECK: attributes [[ATTR1]] = { noinline }
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[GEN1:!.*]]}
-; CHECK: [[GEN1]] = !{!"llvm.loop.mustprogress"}
-; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[GEN1:!.*]]}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[GEN1:!.*]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[GEN1:!.*]]}
attributes #0 = { mustprogress }
attributes #1 = { noinline }
@@ -520,3 +638,10 @@ attributes #2 = { noinline mustprogress }
!5 = distinct !{!5, !1}
!6 = distinct !{!6, !1}
!7 = distinct !{!7, !1}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/Inline/redundant-loads.ll b/llvm/test/Transforms/Inline/redundant-loads.ll
index 773be78..3b066ef 100644
--- a/llvm/test/Transforms/Inline/redundant-loads.ll
+++ b/llvm/test/Transforms/Inline/redundant-loads.ll
@@ -104,11 +104,8 @@ define void @outer6(ptr %a, ptr %ptr) {
ret void
}
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) argmemonly nounwind
-
define void @inner6(ptr %a, ptr %ptr) {
%1 = load i32, ptr %a
- call void @llvm.lifetime.start.p0(i64 32, ptr %ptr) ; This intrinsic does not clobber the first load.
%2 = load i32, ptr %a
call void @pad()
%3 = load i32, ptr %a
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll
new file mode 100644
index 0000000..d255eb0
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s
+
+; ------------------------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large) for wmma.f32.16x16x128.f8f6f4
+; ------------------------------------------------------------------------------------
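+;
+; In the checks below, instcombine shrinks each A/B operand with a
+; shufflevector to the width implied by its format immediate: fp8/bf8 (0/1)
+; keep <16 x i32>, fp6/bf6 (2/3) shrink to <12 x i32>, and fp4 (4) shrinks
+; to <8 x i32>.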
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(
+; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(
+; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
+; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
index 022d60d..d32f0e4 100644
--- a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
+++ b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
@@ -229,7 +229,7 @@ define i32 @abs_of_neg(i32 %x) {
define <4 x i32> @abs_of_neg_vec(<4 x i32> %x) {
; CHECK-LABEL: @abs_of_neg_vec(
-; CHECK-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[X:%.*]], i1 false)
+; CHECK-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[X:%.*]], i1 true)
; CHECK-NEXT: ret <4 x i32> [[B]]
;
%a = sub nsw <4 x i32> zeroinitializer, %x
diff --git a/llvm/test/Transforms/InstCombine/deadcode.ll b/llvm/test/Transforms/InstCombine/deadcode.ll
index e65f0ab..f3e1ba6 100644
--- a/llvm/test/Transforms/InstCombine/deadcode.ll
+++ b/llvm/test/Transforms/InstCombine/deadcode.ll
@@ -26,8 +26,9 @@ declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)
define void @test3() {
- call void @llvm.lifetime.start.p0(i64 -1, ptr undef)
- call void @llvm.lifetime.end.p0(i64 -1, ptr undef)
+ %a = alloca i32
+ call void @llvm.lifetime.start.p0(i64 -1, ptr %a)
+ call void @llvm.lifetime.end.p0(i64 -1, ptr %a)
ret void
}
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index 752ff0c..bb0a94c 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -682,15 +682,15 @@ define i32 @test28() nounwind {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ORIENTATIONS:%.*]] = alloca [1 x [1 x %struct.x]], align 8
; CHECK-NEXT: [[T3:%.*]] = call i32 @puts(ptr noundef nonnull dereferenceable(1) @.str) #[[ATTR0]]
-; CHECK-NEXT: [[T45:%.*]] = getelementptr inbounds nuw i8, ptr [[ORIENTATIONS]], i64 1
; CHECK-NEXT: br label [[BB10:%.*]]
; CHECK: bb10:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[BB10]] ]
; CHECK-NEXT: [[T12_REC:%.*]] = xor i32 [[INDVAR]], -1
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[T12_REC]] to i64
-; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, ptr [[T45]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, ptr [[ORIENTATIONS]], i64 [[TMP1]]
; CHECK-NEXT: [[T16:%.*]] = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str1, ptr nonnull [[T12]]) #[[ATTR0]]
-; CHECK-NEXT: [[T84:%.*]] = icmp eq i32 [[INDVAR]], 0
+; CHECK-NEXT: [[T84:%.*]] = icmp eq i64 [[TMP1]], 0
; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
; CHECK-NEXT: br i1 [[T84]], label [[BB17:%.*]], label [[BB10]]
; CHECK: bb17:
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index 3f10405..aede844 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -849,3 +849,279 @@ define i1 @gep_mugtiple_ugt_inbounds_nusw(ptr %base, i64 %idx, i64 %idx2) {
%cmp = icmp ugt ptr %gep2, %base
ret i1 %cmp
}
+
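+; The tests below exercise instcombine's multi-use limit when folding pointer
+; comparisons over chains of GEPs: short enough chains are rewritten into
+; offset arithmetic, while the *_above_limit case keeps the pointer compare.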
+define i1 @gep_multiple_multi_use_below_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3) {
+; CHECK-LABEL: @gep_multiple_multi_use_below_limit(
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl i64 [[IDX3:%.*]], 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2:%.*]], i64 [[GEP3_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP3]])
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl i64 [[IDX2:%.*]], 2
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[GEP2_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP4]])
+; CHECK-NEXT: [[GEP3_IDX1:%.*]] = shl i64 [[IDX4:%.*]], 2
+; CHECK-NEXT: [[GEP5:%.*]] = getelementptr i8, ptr [[GEP4]], i64 [[GEP3_IDX1]]
+; CHECK-NEXT: call void @use(ptr [[GEP5]])
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[GEP3_IDX]], [[GEP2_IDX]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 0, [[GEP3_IDX1]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr i32, ptr %base, i64 %idx1
+ call void @use(ptr %gep1)
+ %gep2 = getelementptr i32, ptr %gep1, i64 %idx2
+ call void @use(ptr %gep2)
+ %gep3 = getelementptr i32, ptr %gep2, i64 %idx3
+ call void @use(ptr %gep3)
+ %cmp = icmp eq ptr %gep3, %base
+ ret i1 %cmp
+}
+
+define i1 @gep_multiple_multi_use_below_limit_extra_one_use_gep1(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_multiple_multi_use_below_limit_extra_one_use_gep1(
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl i64 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[GEP1_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP1]])
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl i64 [[IDX2:%.*]], 2
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[GEP2_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP2]])
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl i64 [[IDX3:%.*]], 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[GEP3_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP3]])
+; CHECK-NEXT: [[GEP4_IDX_NEG:%.*]] = mul i64 [[IDX4:%.*]], -4
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[GEP1_IDX]], [[GEP2_IDX]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[GEP3_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP2]], [[GEP4_IDX_NEG]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr i32, ptr %base, i64 %idx1
+ call void @use(ptr %gep1)
+ %gep2 = getelementptr i32, ptr %gep1, i64 %idx2
+ call void @use(ptr %gep2)
+ %gep3 = getelementptr i32, ptr %gep2, i64 %idx3
+ call void @use(ptr %gep3)
+ %gep4 = getelementptr i32, ptr %gep3, i64 %idx4
+ %cmp = icmp eq ptr %gep4, %base
+ ret i1 %cmp
+}
+
+define i1 @gep_multiple_multi_use_below_limit_extra_one_use_gep2(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_multiple_multi_use_below_limit_extra_one_use_gep2(
+; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[GEP1_IDX1]], 2
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: call void @use(ptr [[GEP2]])
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl i64 [[IDX3:%.*]], 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[GEP3_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP3]])
+; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl i64 [[IDX4:%.*]], 2
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[GEP4_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP4]])
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[GEP3_IDX]]
+; CHECK-NEXT: [[GEP4_IDX_NEG:%.*]] = sub i64 0, [[GEP4_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP2]], [[GEP4_IDX_NEG]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr i32, ptr %gep1, i64 %idx2
+ call void @use(ptr %gep2)
+ %gep3 = getelementptr i32, ptr %gep2, i64 %idx3
+ call void @use(ptr %gep3)
+ %gep4 = getelementptr i32, ptr %gep3, i64 %idx4
+ call void @use(ptr %gep4)
+ %cmp = icmp eq ptr %gep4, %base
+ ret i1 %cmp
+}
+
+define i1 @gep_multiple_multi_above_below_limit_consts(ptr %base, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @gep_multiple_multi_above_below_limit_consts(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16
+; CHECK-NEXT: call void @use(ptr [[GEP1]])
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[GEP1]], i64 [[IDX1:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP2]])
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 16
+; CHECK-NEXT: call void @use(ptr [[GEP3]])
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i32, ptr [[GEP3]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP4]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEP4]], [[BASE]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr i32, ptr %base, i64 4
+ call void @use(ptr %gep1)
+ %gep2 = getelementptr i32, ptr %gep1, i64 %idx1
+ call void @use(ptr %gep2)
+ %gep3 = getelementptr i32, ptr %gep2, i64 4
+ call void @use(ptr %gep3)
+ %gep4 = getelementptr i32, ptr %gep3, i64 %idx2
+ call void @use(ptr %gep4)
+ %cmp = icmp eq ptr %gep4, %base
+ ret i1 %cmp
+}
+
+define i1 @gep_multiple_multi_use_above_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_multiple_multi_use_above_limit(
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[IDX1:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP4]])
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i32, ptr [[GEP4]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP3]])
+; CHECK-NEXT: [[GEP5:%.*]] = getelementptr i32, ptr [[GEP3]], i64 [[IDX3:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP5]])
+; CHECK-NEXT: [[GEP6:%.*]] = getelementptr i32, ptr [[GEP5]], i64 [[IDX4:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP6]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEP6]], [[BASE]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr i32, ptr %base, i64 %idx1
+ call void @use(ptr %gep1)
+ %gep2 = getelementptr i32, ptr %gep1, i64 %idx2
+ call void @use(ptr %gep2)
+ %gep3 = getelementptr i32, ptr %gep2, i64 %idx3
+ call void @use(ptr %gep3)
+ %gep4 = getelementptr i32, ptr %gep3, i64 %idx4
+ call void @use(ptr %gep4)
+ %cmp = icmp eq ptr %gep4, %base
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_eq(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_eq(
+; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[GEP3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]]
+; CHECK-NEXT: [[CMP_UNSHIFTED:%.*]] = xor i64 [[GEP1_IDX1]], [[GEP3_IDX2]]
+; CHECK-NEXT: [[CMP_MASK:%.*]] = and i64 [[CMP_UNSHIFTED]], 4611686018427387903
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[CMP_MASK]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr i32, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr i32, ptr %gep3, i64 %idx4
+ %cmp = icmp eq ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_eq_nuw(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_eq_nuw(
+; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[GEP3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[GEP1_IDX1]], [[GEP3_IDX2]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr nuw i32, ptr %gep3, i64 %idx4
+ %cmp = icmp eq ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_eq_nuw_different_scales(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_eq_nuw_different_scales(
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nuw i64 [[IDX2:%.*]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[GEP1_IDX]], [[GEP2_IDX]]
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nuw i64 [[IDX3:%.*]], 2
+; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl nuw i64 [[IDX4:%.*]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[GEP3_IDX]], [[GEP4_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr nuw i64, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr nuw i64, ptr %gep3, i64 %idx4
+ %cmp = icmp eq ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_eq_partial_nuw_different_scales(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_eq_partial_nuw_different_scales(
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nuw i64 [[IDX2:%.*]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[GEP1_IDX]], [[GEP2_IDX]]
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nuw i64 [[IDX3:%.*]], 2
+; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl i64 [[IDX4:%.*]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[GEP3_IDX]], [[GEP4_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr nuw i64, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr i64, ptr %gep3, i64 %idx4
+ %cmp = icmp eq ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_eq_partial_inbounds_different_scales(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_eq_partial_inbounds_different_scales(
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[GEP1_IDX]], [[GEP2_IDX]]
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2
+; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl i64 [[IDX4:%.*]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[GEP3_IDX]], [[GEP4_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr inbounds i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr inbounds i64, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr inbounds i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr i64, ptr %gep3, i64 %idx4
+ %cmp = icmp eq ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_ult_nuw(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_ult_nuw(
+; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[GEP3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[GEP1_IDX1]], [[GEP3_IDX2]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr nuw i32, ptr %gep3, i64 %idx4
+ %cmp = icmp ult ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_ult_missing_nuw(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_ult_missing_nuw(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nuw i32, ptr [[BASE:%.*]], i64 [[IDX1:%.*]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr nuw i32, ptr [[GEP1]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr nuw i32, ptr [[BASE]], i64 [[IDX3:%.*]]
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i32, ptr [[GEP3]], i64 [[IDX4:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP2]], [[GEP4]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr i32, ptr %gep3, i64 %idx4
+ %cmp = icmp ult ptr %gep2, %gep4
+ ret i1 %cmp
+}
+
+define i1 @gep_gep_multiple_ult_nuw_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @gep_gep_multiple_ult_nuw_multi_use(
+; CHECK-NEXT: [[IDX3:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nuw i64 [[IDX3]], 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr nuw i8, ptr [[BASE:%.*]], i64 [[GEP3_IDX]]
+; CHECK-NEXT: [[IDX4:%.*]] = add i64 [[IDX5:%.*]], [[IDX6:%.*]]
+; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl nuw i64 [[IDX4]], 2
+; CHECK-NEXT: [[GEP5:%.*]] = getelementptr nuw i8, ptr [[BASE]], i64 [[GEP4_IDX]]
+; CHECK-NEXT: call void @use(ptr [[GEP3]])
+; CHECK-NEXT: call void @use(ptr [[GEP5]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[GEP3_IDX]], [[GEP4_IDX]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1
+ %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2
+ %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3
+ %gep4 = getelementptr nuw i32, ptr %gep3, i64 %idx4
+ call void @use(ptr %gep2)
+ call void @use(ptr %gep4)
+ %cmp = icmp ult ptr %gep2, %gep4
+ ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/malloc-free.ll b/llvm/test/Transforms/InstCombine/malloc-free.ll
index 989074f..d8a1c07 100644
--- a/llvm/test/Transforms/InstCombine/malloc-free.ll
+++ b/llvm/test/Transforms/InstCombine/malloc-free.ll
@@ -109,8 +109,6 @@ define void @test3(ptr %src) {
; CHECK-NEXT: ret void
;
%a = call noalias ptr @malloc(i32 10)
- call void @llvm.lifetime.start.p0(i64 10, ptr %a)
- call void @llvm.lifetime.end.p0(i64 10, ptr %a)
%size = call i64 @llvm.objectsize.i64(ptr %a, i1 true)
store i8 42, ptr %a
call void @llvm.memcpy.p0.p0.i32(ptr %a, ptr %src, i32 32, i1 false)
diff --git a/llvm/test/Transforms/InstCombine/pr150338.ll b/llvm/test/Transforms/InstCombine/pr150338.ll
new file mode 100644
index 0000000..2ad454e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr150338.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+; Make sure this does not crash.
+define void @test(ptr %arg) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT: store i1 true, ptr poison, align 1
+; CHECK-NEXT: ret void
+;
+ %a = alloca i32
+ store ptr %a, ptr %arg
+ store i1 true, ptr poison
+ call void @llvm.lifetime.end.p0(i64 4, ptr %a)
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
index 9a0a6ae..95753a2 100644
--- a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
+++ b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
@@ -174,16 +174,12 @@ define { <16 x i8>, <32 x i8> } @differenttypes({ <4 x i32>, <8 x i32> } %a, ptr
; CHECK-LABEL: define { <16 x i8>, <32 x i8> } @differenttypes
; CHECK-SAME: ({ <4 x i32>, <8 x i32> } [[A:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[P]])
; CHECK-NEXT: store { <4 x i32>, <8 x i32> } [[A]], ptr [[P]], align 16
; CHECK-NEXT: [[TMP0:%.*]] = load { <16 x i8>, <32 x i8> }, ptr [[P]], align 16
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[P]])
; CHECK-NEXT: ret { <16 x i8>, <32 x i8> } [[TMP0]]
;
entry:
- call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %p) #5
store { <4 x i32>, <8 x i32> } %a, ptr %p, align 16
%2 = load { <16 x i8>, <32 x i8> }, ptr %p, align 16
- call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %p) #5
ret { <16 x i8>, <32 x i8> } %2
}
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll
index 11af6b4..45e5686 100644
--- a/llvm/test/Transforms/InstCombine/sub-gep.ll
+++ b/llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -945,19 +945,15 @@ define i64 @multiple_geps_two_chains_gep_base(ptr %base, i64 %base.idx, i64 %idx
define i64 @multiple_geps_two_chains_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
; CHECK-LABEL: @multiple_geps_two_chains_multi_use(
-; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 2
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 [[P2_IDX]]
-; CHECK-NEXT: [[P4_IDX:%.*]] = shl nsw i64 [[IDX4:%.*]], 2
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX]]
-; CHECK-NEXT: [[P3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[P3_IDX]]
-; CHECK-NEXT: [[P4_IDX1:%.*]] = shl nsw i64 [[IDX5:%.*]], 2
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P3]], i64 [[P4_IDX1]]
+; CHECK-NEXT: [[P1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[P4_IDX:%.*]] = shl i64 [[P1_IDX1]], 2
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 [[P4_IDX]]
+; CHECK-NEXT: [[P3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]]
+; CHECK-NEXT: [[P4_IDX1:%.*]] = shl i64 [[P3_IDX2]], 2
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX1]]
; CHECK-NEXT: call void @use(ptr [[P5]])
; CHECK-NEXT: call void @use(ptr [[P4]])
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[P2_IDX]], [[P4_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[P3_IDX]], [[P4_IDX1]]
-; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[P4_IDX]], [[P4_IDX1]]
; CHECK-NEXT: ret i64 [[GEPDIFF]]
;
%p1 = getelementptr inbounds i32, ptr %base, i64 %idx1
@@ -974,23 +970,18 @@ define i64 @multiple_geps_two_chains_multi_use(ptr %base, i64 %idx1, i64 %idx2,
define i64 @multiple_geps_two_chains_partial_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4, i64 %idx5, i64 %idx6) {
; CHECK-LABEL: @multiple_geps_two_chains_partial_multi_use(
-; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 2
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 [[P2_IDX]]
-; CHECK-NEXT: [[P4_IDX:%.*]] = shl nsw i64 [[IDX4:%.*]], 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX]]
-; CHECK-NEXT: [[P3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2
-; CHECK-NEXT: [[P4_IDX1:%.*]] = shl nsw i64 [[IDX7:%.*]], 2
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[P4_IDX1]]
-; CHECK-NEXT: [[P5_IDX:%.*]] = shl nsw i64 [[IDX5:%.*]], 2
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P5]], i64 [[P5_IDX]]
-; CHECK-NEXT: [[P6_IDX:%.*]] = shl nsw i64 [[IDX6:%.*]], 2
+; CHECK-NEXT: [[P1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]]
+; CHECK-NEXT: [[P4_IDX:%.*]] = shl i64 [[P1_IDX1]], 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 [[P4_IDX]]
+; CHECK-NEXT: [[P4_IDX2:%.*]] = add i64 [[IDX4:%.*]], [[IDX5:%.*]]
+; CHECK-NEXT: [[P5_IDX:%.*]] = shl i64 [[P4_IDX2]], 2
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P5_IDX]]
; CHECK-NEXT: call void @use(ptr [[P3]])
; CHECK-NEXT: call void @use(ptr [[P4]])
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[P2_IDX]], [[P4_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[P3_IDX]]
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[P4_IDX1]], [[P5_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], [[P6_IDX]]
-; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[P1_IDX1]], [[IDX3:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[P4_IDX2]], [[IDX6:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[GEPDIFF:%.*]] = shl i64 [[TMP5]], 2
; CHECK-NEXT: ret i64 [[GEPDIFF]]
;
%p1 = getelementptr inbounds i32, ptr %base, i64 %idx1
@@ -1007,6 +998,29 @@ define i64 @multiple_geps_two_chains_partial_multi_use(ptr %base, i64 %idx1, i64
ret i64 %d
}
+define i64 @multiple_geps_two_chains_partial_multi_use_insert_point(ptr %p, i64 %idx1, i64 %idx2, i64 %idx3) {
+; CHECK-LABEL: @multiple_geps_two_chains_partial_multi_use_insert_point(
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8
+; CHECK-NEXT: call void @use(ptr [[GEP2]])
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[IDX2:%.*]], [[IDX3:%.*]]
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[TMP1]]
+; CHECK-NEXT: call void @use(ptr [[GEP4]])
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 8
+; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[IDX1:%.*]], [[TMP2]]
+; CHECK-NEXT: ret i64 [[GEPDIFF]]
+;
+ %gep1 = getelementptr i8, ptr %p, i64 %idx1
+ %gep2 = getelementptr i8, ptr %p, i64 8
+ call void @use(ptr %gep2)
+ %gep3 = getelementptr i8, ptr %gep2, i64 %idx2
+ %gep4 = getelementptr i8, ptr %gep3, i64 %idx3
+ call void @use(ptr %gep4)
+ %gep1.int = ptrtoint ptr %gep1 to i64
+ %gep4.int = ptrtoint ptr %gep4 to i64
+ %sub = sub i64 %gep1.int, %gep4.int
+ ret i64 %sub
+}
+
define i64 @multiple_geps_inbounds(ptr %base, i64 %idx, i64 %idx2) {
; CHECK-LABEL: @multiple_geps_inbounds(
; CHECK-NEXT: [[D:%.*]] = add nsw i64 [[IDX:%.*]], [[IDX2:%.*]]
@@ -1158,3 +1172,65 @@ define i64 @nuw_ptrdiff_mul_nsw_nneg_scale_multiuse(ptr %base, i64 %idx) {
%diff = sub nuw i64 %lhs, %rhs
ret i64 %diff
}
+
+define i64 @multiple_geps_multi_use_below_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) {
+; CHECK-LABEL: @multiple_geps_multi_use_below_limit(
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P2]])
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[IDX5:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P4]])
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[IDX3:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P3]])
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds nuw i8, ptr [[P3]], i64 [[IDX4:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P5]])
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[IDX2]], [[IDX5]]
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[IDX3]], [[IDX4]]
+; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i64 [[GEPDIFF]]
+;
+ %p1 = getelementptr inbounds nuw i8, ptr %base, i64 %idx1
+ call void @use(ptr %p1)
+ %p2 = getelementptr inbounds nuw i8, ptr %p1, i64 %idx2
+ call void @use(ptr %p2)
+ %p3 = getelementptr inbounds nuw i8, ptr %base, i64 %idx3
+ call void @use(ptr %p3)
+ %p4 = getelementptr inbounds nuw i8, ptr %p3, i64 %idx4
+ call void @use(ptr %p4)
+ %i1 = ptrtoint ptr %p4 to i64
+ %i2 = ptrtoint ptr %p2 to i64
+ %d = sub i64 %i2, %i1
+ ret i64 %d
+}
+
+define i64 @multiple_geps_multi_use_above_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4, i64 %idx5) {
+; CHECK-LABEL: @multiple_geps_multi_use_above_limit(
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P2]])
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[IDX6:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P3]])
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[TMP3:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P5]])
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds nuw i8, ptr [[P5]], i64 [[IDX7:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P6]])
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds nuw i8, ptr [[P6]], i64 [[IDX5:%.*]]
+; CHECK-NEXT: call void @use(ptr [[P7]])
+; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[P7]] to i64
+; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[P3]] to i64
+; CHECK-NEXT: [[D:%.*]] = sub i64 [[I2]], [[I1]]
+; CHECK-NEXT: ret i64 [[D]]
+;
+ %p1 = getelementptr inbounds nuw i8, ptr %base, i64 %idx1
+ call void @use(ptr %p1)
+ %p2 = getelementptr inbounds nuw i8, ptr %p1, i64 %idx2
+ call void @use(ptr %p2)
+ %p3 = getelementptr inbounds nuw i8, ptr %base, i64 %idx3
+ call void @use(ptr %p3)
+ %p4 = getelementptr inbounds nuw i8, ptr %p3, i64 %idx4
+ call void @use(ptr %p4)
+ %p5 = getelementptr inbounds nuw i8, ptr %p4, i64 %idx5
+ call void @use(ptr %p5)
+ %i1 = ptrtoint ptr %p5 to i64
+ %i2 = ptrtoint ptr %p2 to i64
+ %d = sub i64 %i2, %i1
+ ret i64 %d
+}
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
new file mode 100644
index 0000000..75b8509
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
@@ -0,0 +1,646 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; Test constant-folding for various NVVM unary arithmetic intrinsics.
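+; The .ftz variants flush subnormal float inputs (and results) to
+; sign-preserving zero, so e.g. ceil.ftz.f folds a positive subnormal to 0.0
+; where ceil.f folds it to 1.0; the rcp.*.ftz.f subnormal cases are left
+; unfolded.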
+
+;###############################################################
+;# Ceil #
+;###############################################################
+
+define double @test_ceil_d_1_25() {
+; CHECK-LABEL: define double @test_ceil_d_1_25() {
+; CHECK-NEXT: ret double 2.000000e+00
+;
+ %res = call double @llvm.nvvm.ceil.d(double 1.25)
+ ret double %res
+}
+
+define float @test_ceil_f_1_25() {
+; CHECK-LABEL: define float @test_ceil_f_1_25() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.ceil.f(float 1.25)
+ ret float %res
+}
+
+define float @test_ceil_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_ceil_ftz_f_1_25() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.ceil.ftz.f(float 1.25)
+ ret float %res
+}
+
+define double @test_ceil_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_ceil_d_pos_subnorm() {
+; CHECK-NEXT: ret double 1.000000e+00
+;
+ %res = call double @llvm.nvvm.ceil.d(double 0x380FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_ceil_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_ceil_f_pos_subnorm() {
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %res = call float @llvm.nvvm.ceil.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_ceil_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_ceil_ftz_f_pos_subnorm() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.ceil.ftz.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+;###############################################################
+;# FAbs #
+;###############################################################
+
+define float @test_fabs_neg_1_5() {
+; CHECK-LABEL: define float @test_fabs_neg_1_5() {
+; CHECK-NEXT: ret float 1.500000e+00
+;
+ %res = call float @llvm.nvvm.fabs(float -1.5)
+ ret float %res
+}
+
+define float @test_fabs_ftz_neg_1_5() {
+; CHECK-LABEL: define float @test_fabs_ftz_neg_1_5() {
+; CHECK-NEXT: ret float 1.500000e+00
+;
+ %res = call float @llvm.nvvm.fabs.ftz(float -1.5)
+ ret float %res
+}
+
+define float @test_fabs_1_25() {
+; CHECK-LABEL: define float @test_fabs_1_25() {
+; CHECK-NEXT: ret float 1.250000e+00
+;
+ %res = call float @llvm.nvvm.fabs(float 1.25)
+ ret float %res
+}
+
+define float @test_fabs_ftz_1_25() {
+; CHECK-LABEL: define float @test_fabs_ftz_1_25() {
+; CHECK-NEXT: ret float 1.250000e+00
+;
+ %res = call float @llvm.nvvm.fabs.ftz(float 1.25)
+ ret float %res
+}
+
+define float @test_fabs_neg_subnorm() {
+; CHECK-LABEL: define float @test_fabs_neg_subnorm() {
+; CHECK-NEXT: ret float 0x380FFFFFC0000000
+;
+ %res = call float @llvm.nvvm.fabs(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_fabs_ftz_neg_subnorm() {
+; CHECK-LABEL: define float @test_fabs_ftz_neg_subnorm() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.fabs.ftz(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_fabs_pos_subnorm() {
+; CHECK-LABEL: define float @test_fabs_pos_subnorm() {
+; CHECK-NEXT: ret float 0x380FFFFFC0000000
+;
+ %res = call float @llvm.nvvm.fabs(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_fabs_ftz_pos_subnorm() {
+; CHECK-LABEL: define float @test_fabs_ftz_pos_subnorm() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.fabs.ftz(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+
+;###############################################################
+;# Floor #
+;###############################################################
+
+define double @test_floor_d_1_25() {
+; CHECK-LABEL: define double @test_floor_d_1_25() {
+; CHECK-NEXT: ret double 1.000000e+00
+;
+ %res = call double @llvm.nvvm.floor.d(double 1.25)
+ ret double %res
+}
+
+define float @test_floor_f_1_25() {
+; CHECK-LABEL: define float @test_floor_f_1_25() {
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %res = call float @llvm.nvvm.floor.f(float 1.25)
+ ret float %res
+}
+
+define float @test_floor_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_floor_ftz_f_1_25() {
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %res = call float @llvm.nvvm.floor.ftz.f(float 1.25)
+ ret float %res
+}
+
+define double @test_floor_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_floor_d_neg_subnorm() {
+; CHECK-NEXT: ret double -1.000000e+00
+;
+ %res = call double @llvm.nvvm.floor.d(double 0xB80FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_floor_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_floor_f_neg_subnorm() {
+; CHECK-NEXT: ret float -1.000000e+00
+;
+ %res = call float @llvm.nvvm.floor.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_floor_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_floor_ftz_f_neg_subnorm() {
+; CHECK-NEXT: ret float -0.000000e+00
+;
+ %res = call float @llvm.nvvm.floor.ftz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+;###############################################################
+;# Rcp #
+;###############################################################
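+;
+; The reciprocal of a subnormal is not exactly representable, so the folded
+; value depends on the rounding mode encoded in the intrinsic name: for the
+; negative subnormal input below, the rcp.rm (round toward -inf) result is
+; one ulp more negative than the rcp.rn, rcp.rp, and rcp.rz results.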
+
+;+-------------------------------------------------------------+
+;| rcp_rm |
+;+-------------------------------------------------------------+
+define double @test_rcp_rm_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rm_d_0_5() {
+; CHECK-NEXT: ret double 2.000000e+00
+;
+ %res = call double @llvm.nvvm.rcp.rm.d(double 0.5)
+ ret double %res
+}
+
+define float @test_rcp_rm_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rm_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rm.f(float 0.5)
+ ret float %res
+}
+
+define float @test_rcp_rm_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rm_ftz_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0.5)
+ ret float %res
+}
+
+define double @test_rcp_rm_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rm_d_neg_subnorm() {
+; CHECK-NEXT: ret double 0xC7D0000020000041
+;
+ %res = call double @llvm.nvvm.rcp.rm.d(double 0xB80FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_rcp_rm_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rm_f_neg_subnorm() {
+; CHECK-NEXT: ret float 0xC7D0000040000000
+;
+ %res = call float @llvm.nvvm.rcp.rm.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_rcp_rm_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rm_ftz_f_neg_subnorm() {
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT: ret float [[RES]]
+;
+ %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+;+-------------------------------------------------------------+
+;| rcp_rn |
+;+-------------------------------------------------------------+
+define double @test_rcp_rn_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rn_d_0_5() {
+; CHECK-NEXT: ret double 2.000000e+00
+;
+ %res = call double @llvm.nvvm.rcp.rn.d(double 0.5)
+ ret double %res
+}
+
+define float @test_rcp_rn_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rn_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rn.f(float 0.5)
+ ret float %res
+}
+
+define float @test_rcp_rn_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rn_ftz_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0.5)
+ ret float %res
+}
+
+define double @test_rcp_rn_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rn_d_neg_subnorm() {
+; CHECK-NEXT: ret double 0xC7D0000020000040
+;
+ %res = call double @llvm.nvvm.rcp.rn.d(double 0xB80FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_rcp_rn_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rn_f_neg_subnorm() {
+; CHECK-NEXT: ret float 0xC7D0000020000000
+;
+ %res = call float @llvm.nvvm.rcp.rn.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_rcp_rn_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rn_ftz_f_neg_subnorm() {
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT: ret float [[RES]]
+;
+ %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+;+-------------------------------------------------------------+
+;| rcp_rp |
+;+-------------------------------------------------------------+
+define double @test_rcp_rp_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rp_d_0_5() {
+; CHECK-NEXT: ret double 2.000000e+00
+;
+ %res = call double @llvm.nvvm.rcp.rp.d(double 0.5)
+ ret double %res
+}
+
+define float @test_rcp_rp_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rp_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rp.f(float 0.5)
+ ret float %res
+}
+
+define float @test_rcp_rp_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rp_ftz_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0.5)
+ ret float %res
+}
+
+define double @test_rcp_rp_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rp_d_neg_subnorm() {
+; CHECK-NEXT: ret double 0xC7D0000020000040
+;
+ %res = call double @llvm.nvvm.rcp.rp.d(double 0xB80FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_rcp_rp_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rp_f_neg_subnorm() {
+; CHECK-NEXT: ret float 0xC7D0000020000000
+;
+ %res = call float @llvm.nvvm.rcp.rp.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_rcp_rp_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rp_ftz_f_neg_subnorm() {
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT: ret float [[RES]]
+;
+ %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+;+-------------------------------------------------------------+
+;| rcp_rz |
+;+-------------------------------------------------------------+
+define double @test_rcp_rz_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rz_d_0_5() {
+; CHECK-NEXT: ret double 2.000000e+00
+;
+ %res = call double @llvm.nvvm.rcp.rz.d(double 0.5)
+ ret double %res
+}
+
+define float @test_rcp_rz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rz_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rz.f(float 0.5)
+ ret float %res
+}
+
+define float @test_rcp_rz_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rz_ftz_f_0_5() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0.5)
+ ret float %res
+}
+
+define double @test_rcp_rz_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rz_d_neg_subnorm() {
+; CHECK-NEXT: ret double 0xC7D0000020000040
+;
+ %res = call double @llvm.nvvm.rcp.rz.d(double 0xB80FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_rcp_rz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rz_f_neg_subnorm() {
+; CHECK-NEXT: ret float 0xC7D0000020000000
+;
+ %res = call float @llvm.nvvm.rcp.rz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_rcp_rz_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rz_ftz_f_neg_subnorm() {
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT: ret float [[RES]]
+;
+ %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+;###############################################################
+;# Round #
+;###############################################################
+
+define double @test_round_d_neg_1_5() {
+; CHECK-LABEL: define double @test_round_d_neg_1_5() {
+; CHECK-NEXT: ret double -2.000000e+00
+;
+ %res = call double @llvm.nvvm.round.d(double -1.5)
+ ret double %res
+}
+
+define float @test_round_f_neg_1_5() {
+; CHECK-LABEL: define float @test_round_f_neg_1_5() {
+; CHECK-NEXT: ret float -2.000000e+00
+;
+ %res = call float @llvm.nvvm.round.f(float -1.5)
+ ret float %res
+}
+
+define float @test_round_ftz_f_neg_1_5() {
+; CHECK-LABEL: define float @test_round_ftz_f_neg_1_5() {
+; CHECK-NEXT: ret float -2.000000e+00
+;
+ %res = call float @llvm.nvvm.round.ftz.f(float -1.5)
+ ret float %res
+}
+
+define double @test_round_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_round_d_neg_subnorm() {
+; CHECK-NEXT: ret double -0.000000e+00
+;
+ %res = call double @llvm.nvvm.round.d(double 0xB80FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_round_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_round_f_neg_subnorm() {
+; CHECK-NEXT: ret float -0.000000e+00
+;
+ %res = call float @llvm.nvvm.round.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_round_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_round_ftz_f_neg_subnorm() {
+; CHECK-NEXT: ret float -0.000000e+00
+;
+ %res = call float @llvm.nvvm.round.ftz.f(float 0xB80FFFFFC0000000)
+ ret float %res
+}
+
+;###############################################################
+;# Saturate #
+;###############################################################
+
+define double @test_saturate_d_1_25() {
+; CHECK-LABEL: define double @test_saturate_d_1_25() {
+; CHECK-NEXT: ret double 1.000000e+00
+;
+ %res = call double @llvm.nvvm.saturate.d(double 1.25)
+ ret double %res
+}
+
+define float @test_saturate_f_1_25() {
+; CHECK-LABEL: define float @test_saturate_f_1_25() {
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %res = call float @llvm.nvvm.saturate.f(float 1.25)
+ ret float %res
+}
+
+define float @test_saturate_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_1_25() {
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %res = call float @llvm.nvvm.saturate.ftz.f(float 1.25)
+ ret float %res
+}
+
+define double @test_saturate_d_neg_1_25() {
+; CHECK-LABEL: define double @test_saturate_d_neg_1_25() {
+; CHECK-NEXT: ret double 0.000000e+00
+;
+ %res = call double @llvm.nvvm.saturate.d(double -1.25)
+ ret double %res
+}
+
+define float @test_saturate_f_neg_1_25() {
+; CHECK-LABEL: define float @test_saturate_f_neg_1_25() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.saturate.f(float -1.25)
+ ret float %res
+}
+
+define float @test_saturate_ftz_f_neg_1_25() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_neg_1_25() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.saturate.ftz.f(float -1.25)
+ ret float %res
+}
+
+define double @test_saturate_d_0_5() {
+; CHECK-LABEL: define double @test_saturate_d_0_5() {
+; CHECK-NEXT: ret double 5.000000e-01
+;
+ %res = call double @llvm.nvvm.saturate.d(double 0.5)
+ ret double %res
+}
+
+define float @test_saturate_f_0_5() {
+; CHECK-LABEL: define float @test_saturate_f_0_5() {
+; CHECK-NEXT: ret float 5.000000e-01
+;
+ %res = call float @llvm.nvvm.saturate.f(float 0.5)
+ ret float %res
+}
+
+define float @test_saturate_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_0_5() {
+; CHECK-NEXT: ret float 5.000000e-01
+;
+ %res = call float @llvm.nvvm.saturate.ftz.f(float 0.5)
+ ret float %res
+}
+
+define double @test_saturate_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_saturate_d_pos_subnorm() {
+; CHECK-NEXT: ret double 0x380FFFFFC0000000
+;
+ %res = call double @llvm.nvvm.saturate.d(double 0x380FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_saturate_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_saturate_f_pos_subnorm() {
+; CHECK-NEXT: ret float 0x380FFFFFC0000000
+;
+ %res = call float @llvm.nvvm.saturate.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_saturate_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_pos_subnorm() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.saturate.ftz.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+;###############################################################
+;# Sqrt #
+;###############################################################
+
+define float @test_sqrt_f_4() {
+; CHECK-LABEL: define float @test_sqrt_f_4() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.sqrt.f(float 4.0)
+ ret float %res
+}
+
+define float @test_sqrt_rn_f_4() {
+; CHECK-LABEL: define float @test_sqrt_rn_f_4() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.sqrt.rn.f(float 4.0)
+ ret float %res
+}
+
+define double @test_sqrt_rn_d_4() {
+; CHECK-LABEL: define double @test_sqrt_rn_d_4() {
+; CHECK-NEXT: ret double 2.000000e+00
+;
+ %res = call double @llvm.nvvm.sqrt.rn.d(double 4.0)
+ ret double %res
+}
+
+define float @test_sqrt_rn_ftz_f_4() {
+; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_4() {
+; CHECK-NEXT: ret float 2.000000e+00
+;
+ %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 4.0)
+ ret float %res
+}
+
+define float @test_sqrt_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_f_pos_subnorm() {
+; CHECK-NEXT: ret float 0x3BFFFFFFE0000000
+;
+ %res = call float @llvm.nvvm.sqrt.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+define float @test_sqrt_rn_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_rn_f_pos_subnorm() {
+; CHECK-NEXT: ret float 0x3BFFFFFFE0000000
+;
+ %res = call float @llvm.nvvm.sqrt.rn.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
+define double @test_sqrt_rn_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_sqrt_rn_d_pos_subnorm() {
+; CHECK-NEXT: ret double 0x3BFFFFFFDFFFFFF0
+;
+ %res = call double @llvm.nvvm.sqrt.rn.d(double 0x380FFFFFC0000000)
+ ret double %res
+}
+
+define float @test_sqrt_rn_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_pos_subnorm() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 0x380FFFFFC0000000)
+ ret float %res
+}
+
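+; Note: the ftz variants flush subnormal inputs to sign-preserving zero, which
+; is why saturate.ftz and sqrt.rn.ftz of a positive subnormal fold to 0.0
+; above while their non-ftz counterparts preserve the subnormal value, and
+; why the rcp.*.ftz calls on a subnormal input are left unfolded (flushing
+; the input to zero would presumably make the folded reciprocal infinite).
+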
+declare double @llvm.nvvm.ceil.d(double)
+declare float @llvm.nvvm.ceil.f(float)
+declare float @llvm.nvvm.ceil.ftz.f(float)
+
+declare float @llvm.nvvm.fabs(float)
+declare float @llvm.nvvm.fabs.ftz(float)
+
+declare double @llvm.nvvm.floor.d(double)
+declare float @llvm.nvvm.floor.f(float)
+declare float @llvm.nvvm.floor.ftz.f(float)
+
+declare double @llvm.nvvm.rcp.rm.d(double)
+declare float @llvm.nvvm.rcp.rm.f(float)
+declare float @llvm.nvvm.rcp.rm.ftz.f(float)
+declare double @llvm.nvvm.rcp.rn.d(double)
+declare float @llvm.nvvm.rcp.rn.f(float)
+declare float @llvm.nvvm.rcp.rn.ftz.f(float)
+declare double @llvm.nvvm.rcp.rp.d(double)
+declare float @llvm.nvvm.rcp.rp.f(float)
+declare float @llvm.nvvm.rcp.rp.ftz.f(float)
+declare double @llvm.nvvm.rcp.rz.d(double)
+declare float @llvm.nvvm.rcp.rz.f(float)
+declare float @llvm.nvvm.rcp.rz.ftz.f(float)
+
+declare double @llvm.nvvm.round.d(double)
+declare float @llvm.nvvm.round.f(float)
+declare float @llvm.nvvm.round.ftz.f(float)
+
+declare double @llvm.nvvm.saturate.d(double)
+declare float @llvm.nvvm.saturate.f(float)
+declare float @llvm.nvvm.saturate.ftz.f(float)
+
+declare float @llvm.nvvm.sqrt.f(float)
+declare double @llvm.nvvm.sqrt.rn.d(double)
+declare float @llvm.nvvm.sqrt.rn.f(float)
+declare float @llvm.nvvm.sqrt.rn.ftz.f(float)
diff --git a/llvm/test/Transforms/InstSimplify/exp10.ll b/llvm/test/Transforms/InstSimplify/exp10.ll
index c415c41..17c0811 100644
--- a/llvm/test/Transforms/InstSimplify/exp10.ll
+++ b/llvm/test/Transforms/InstSimplify/exp10.ll
@@ -57,8 +57,7 @@ define <vscale x 2 x float> @exp10_exp10_scalable_vector(<vscale x 2 x float> %x
define float @exp10_poison() {
; CHECK-LABEL: define float @exp10_poison() {
-; CHECK-NEXT: [[RET:%.*]] = call float @llvm.exp10.f32(float poison)
-; CHECK-NEXT: ret float [[RET]]
+; CHECK-NEXT: ret float poison
;
%ret = call float @llvm.exp10.f32(float poison)
ret float %ret
@@ -66,8 +65,7 @@ define float @exp10_poison() {
define <2 x float> @exp10_poison_vector() {
; CHECK-LABEL: define <2 x float> @exp10_poison_vector() {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.exp10.v2f32(<2 x float> poison)
-; CHECK-NEXT: ret <2 x float> [[RET]]
+; CHECK-NEXT: ret <2 x float> poison
;
%ret = call <2 x float> @llvm.exp10.v2f32(<2 x float> poison)
ret <2 x float> %ret
@@ -75,8 +73,7 @@ define <2 x float> @exp10_poison_vector() {
define <vscale x 2 x float> @exp10_poison_scaleable_vector() {
; CHECK-LABEL: define <vscale x 2 x float> @exp10_poison_scaleable_vector() {
-; CHECK-NEXT: [[RET:%.*]] = call <vscale x 2 x float> @llvm.exp10.nxv2f32(<vscale x 2 x float> poison)
-; CHECK-NEXT: ret <vscale x 2 x float> [[RET]]
+; CHECK-NEXT: ret <vscale x 2 x float> poison
;
%ret = call <vscale x 2 x float> @llvm.exp10.nxv2f32(<vscale x 2 x float> poison)
ret <vscale x 2 x float> %ret
diff --git a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll
index e4cfa46..45f5e37 100644
--- a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll
+++ b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll
@@ -286,3 +286,327 @@ define void @tanh_poison(ptr %P) {
ret void
}
+
+
+define void @exp_poison(ptr %P) {
+; CHECK-LABEL: @exp_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %exp_f32 = call float @llvm.exp(float poison)
+ store volatile float %exp_f32, ptr %P
+
+ %exp_2xf32 = call <2 x float> @llvm.exp(<2 x float> poison)
+ store volatile <2 x float> %exp_2xf32, ptr %P
+
+ %exp_4xf64 = call <4 x double> @llvm.exp(<4 x double> poison)
+ store volatile <4 x double> %exp_4xf64, ptr %P
+
+ %exp2_f32 = call float @llvm.exp2(float poison)
+ store volatile float %exp2_f32, ptr %P
+
+ %exp2_2xf32 = call <2 x float> @llvm.exp2(<2 x float> poison)
+ store volatile <2 x float> %exp2_2xf32, ptr %P
+
+ %exp2_4xf64 = call <4 x double> @llvm.exp2(<4 x double> poison)
+ store volatile <4 x double> %exp2_4xf64, ptr %P
+
+ %exp10_f32 = call float @llvm.exp10(float poison)
+ store volatile float %exp10_f32, ptr %P
+
+ %exp10_2xf32 = call <2 x float> @llvm.exp10(<2 x float> poison)
+ store volatile <2 x float> %exp10_2xf32, ptr %P
+
+ %exp10_4xf64 = call <4 x double> @llvm.exp10(<4 x double> poison)
+ store volatile <4 x double> %exp10_4xf64, ptr %P
+ ret void
+}
+
+
+define void @log_poison(ptr %P) {
+; CHECK-LABEL: @log_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %log_f32 = call float @llvm.log(float poison)
+ store volatile float %log_f32, ptr %P
+
+ %log_2xf32 = call <2 x float> @llvm.log(<2 x float> poison)
+ store volatile <2 x float> %log_2xf32, ptr %P
+
+ %log_4xf64 = call <4 x double> @llvm.log(<4 x double> poison)
+ store volatile <4 x double> %log_4xf64, ptr %P
+
+ %log2_f32 = call float @llvm.log2(float poison)
+ store volatile float %log2_f32, ptr %P
+
+ %log2_2xf32 = call <2 x float> @llvm.log2(<2 x float> poison)
+ store volatile <2 x float> %log2_2xf32, ptr %P
+
+ %log2_4xf64 = call <4 x double> @llvm.log2(<4 x double> poison)
+ store volatile <4 x double> %log2_4xf64, ptr %P
+
+ %log10_f32 = call float @llvm.log10(float poison)
+ store volatile float %log10_f32, ptr %P
+
+ %log10_2xf32 = call <2 x float> @llvm.log10(<2 x float> poison)
+ store volatile <2 x float> %log10_2xf32, ptr %P
+
+ %log10_4xf64 = call <4 x double> @llvm.log10(<4 x double> poison)
+ store volatile <4 x double> %log10_4xf64, ptr %P
+ ret void
+}
+
+
+define void @modf_poison(ptr %P) {
+; CHECK-LABEL: @modf_poison(
+; CHECK-NEXT: store volatile { float, float } poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %modf_f32 = call { float, float } @llvm.modf(float poison)
+ store volatile { float, float } %modf_f32, ptr %P
+
+ %modf_2xf32 = call { <2 x float>, <2 x float> } @llvm.modf(<2 x float> poison)
+ store volatile { <2 x float>, <2 x float> } %modf_2xf32, ptr %P
+
+ %modf_4xf64 = call { <4 x double>, <4 x double> } @llvm.modf(<4 x double> poison)
+ store volatile { <4 x double>, <4 x double> } %modf_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @floor_poison(ptr %P) {
+; CHECK-LABEL: @floor_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %floor_f32 = call float @llvm.floor(float poison)
+ store volatile float %floor_f32, ptr %P
+
+ %floor_2xf32 = call <2 x float> @llvm.floor(<2 x float> poison)
+ store volatile <2 x float> %floor_2xf32, ptr %P
+
+ %floor_4xf64 = call <4 x double> @llvm.floor(<4 x double> poison)
+ store volatile <4 x double> %floor_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @ceil_poison(ptr %P) {
+; CHECK-LABEL: @ceil_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %ceil_f32 = call float @llvm.ceil(float poison)
+ store volatile float %ceil_f32, ptr %P
+
+ %ceil_2xf32 = call <2 x float> @llvm.ceil(<2 x float> poison)
+ store volatile <2 x float> %ceil_2xf32, ptr %P
+
+ %ceil_4xf64 = call <4 x double> @llvm.ceil(<4 x double> poison)
+ store volatile <4 x double> %ceil_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @trunc_poison(ptr %P) {
+; CHECK-LABEL: @trunc_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %trunc_f32 = call float @llvm.trunc(float poison)
+ store volatile float %trunc_f32, ptr %P
+
+ %trunc_2xf32 = call <2 x float> @llvm.trunc(<2 x float> poison)
+ store volatile <2 x float> %trunc_2xf32, ptr %P
+
+ %trunc_4xf64 = call <4 x double> @llvm.trunc(<4 x double> poison)
+ store volatile <4 x double> %trunc_4xf64, ptr %P
+
+ ret void
+}
+
+define void @rint_poison(ptr %P) {
+; CHECK-LABEL: @rint_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %rint_f32 = call float @llvm.rint(float poison)
+ store volatile float %rint_f32, ptr %P
+
+ %rint_2xf32 = call <2 x float> @llvm.rint(<2 x float> poison)
+ store volatile <2 x float> %rint_2xf32, ptr %P
+
+ %rint_4xf64 = call <4 x double> @llvm.rint(<4 x double> poison)
+ store volatile <4 x double> %rint_4xf64, ptr %P
+
+ ret void
+}
+
+define void @nearbyint_poison(ptr %P) {
+; CHECK-LABEL: @nearbyint_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %nearbyint_f32 = call float @llvm.nearbyint(float poison)
+ store volatile float %nearbyint_f32, ptr %P
+
+ %nearbyint_2xf32 = call <2 x float> @llvm.nearbyint(<2 x float> poison)
+ store volatile <2 x float> %nearbyint_2xf32, ptr %P
+
+ %nearbyint_4xf64 = call <4 x double> @llvm.nearbyint(<4 x double> poison)
+ store volatile <4 x double> %nearbyint_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @round_poison(ptr %P) {
+; CHECK-LABEL: @round_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %round_f32 = call float @llvm.round(float poison)
+ store volatile float %round_f32, ptr %P
+
+ %round_2xf32 = call <2 x float> @llvm.round(<2 x float> poison)
+ store volatile <2 x float> %round_2xf32, ptr %P
+
+ %round_4xf64 = call <4 x double> @llvm.round(<4 x double> poison)
+ store volatile <4 x double> %round_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @roundeven_poison(ptr %P) {
+; CHECK-LABEL: @roundeven_poison(
+; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %roundeven_f32 = call float @llvm.roundeven(float poison)
+ store volatile float %roundeven_f32, ptr %P
+
+ %roundeven_2xf32 = call <2 x float> @llvm.roundeven(<2 x float> poison)
+ store volatile <2 x float> %roundeven_2xf32, ptr %P
+
+ %roundeven_4xf64 = call <4 x double> @llvm.roundeven(<4 x double> poison)
+ store volatile <4 x double> %roundeven_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @lrint_poison(ptr %P) {
+; CHECK-LABEL: @lrint_poison(
+; CHECK-NEXT: store volatile i32 poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x i32> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x i64> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %lrint_f32 = call i32 @llvm.lrint(float poison)
+ store volatile i32 %lrint_f32, ptr %P
+
+ %lrint_2xf32 = call <2 x i32> @llvm.lrint(<2 x float> poison)
+ store volatile <2 x i32> %lrint_2xf32, ptr %P
+
+ %lrint_4xf64 = call <4 x i64> @llvm.lrint(<4 x double> poison)
+ store volatile <4 x i64> %lrint_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @llrint_poison(ptr %P) {
+; CHECK-LABEL: @llrint_poison(
+; CHECK-NEXT: store volatile i32 poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT: store volatile <2 x i32> poison, ptr [[P]], align 8
+; CHECK-NEXT: store volatile <4 x i64> poison, ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %llrint_f32 = call i32 @llvm.llrint(float poison)
+ store volatile i32 %llrint_f32, ptr %P
+
+ %llrint_2xf32 = call <2 x i32> @llvm.llrint(<2 x float> poison)
+ store volatile <2 x i32> %llrint_2xf32, ptr %P
+
+ %llrint_4xf64 = call <4 x i64> @llvm.llrint(<4 x double> poison)
+ store volatile <4 x i64> %llrint_4xf64, ptr %P
+
+ ret void
+}
+
+
+define void @umul_fix_poison(ptr %P) {
+; CHECK-LABEL: @umul_fix_poison(
+; CHECK-NEXT: store volatile i16 poison, ptr [[P:%.*]], align 2
+; CHECK-NEXT: store volatile i32 poison, ptr [[P]], align 4
+; CHECK-NEXT: store volatile <4 x i32> poison, ptr [[P]], align 16
+; CHECK-NEXT: ret void
+;
+ %umul_fix_i16 = call i16 @llvm.umul.fix(i16 poison, i16 poison, i32 2)
+ store volatile i16 %umul_fix_i16, ptr %P
+
+ %umul_fix_i32 = call i32 @llvm.umul.fix(i32 poison, i32 poison, i32 2)
+ store volatile i32 %umul_fix_i32, ptr %P
+
+ %umul_fix_4xi32 = call <4 x i32> @llvm.umul.fix(<4 x i32> poison, <4 x i32> poison, i32 2)
+ store volatile <4 x i32> %umul_fix_4xi32, ptr %P
+
+ ret void
+}
+
+
+define void @umul_fix_sat_poison(ptr %P) {
+; CHECK-LABEL: @umul_fix_sat_poison(
+; CHECK-NEXT: store volatile i16 poison, ptr [[P:%.*]], align 2
+; CHECK-NEXT: store volatile i32 poison, ptr [[P]], align 4
+; CHECK-NEXT: store volatile <4 x i32> poison, ptr [[P]], align 16
+; CHECK-NEXT: ret void
+;
+ %umul_fix_sati16 = call i16 @llvm.umul.fix.sat(i16 poison, i16 poison, i32 2)
+ store volatile i16 %umul_fix_sati16, ptr %P
+
+ %umul_fix_sati32 = call i32 @llvm.umul.fix.sat(i32 poison, i32 poison, i32 2)
+ store volatile i32 %umul_fix_sati32, ptr %P
+
+ %umul_fix_sat4xi32 = call <4 x i32> @llvm.umul.fix.sat(<4 x i32> poison, <4 x i32> poison, i32 2)
+ store volatile <4 x i32> %umul_fix_sat4xi32, ptr %P
+
+ ret void
+}
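+
+; All of the intrinsics above propagate poison: a poison argument makes the
+; call itself fold to poison, so only the stores of poison remain.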
diff --git a/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll b/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll
index a040c3c..5627014 100644
--- a/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll
+++ b/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll
@@ -18,7 +18,7 @@ define i32 @foo(i1 %arg, ptr %arg1) {
; CHECK: [[BB1]]:
; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi ptr [ [[ARG1]], %[[BB0]] ]
; CHECK-NEXT: [[I3_US:%.*]] = call i32 [[UNSWITCHED_SELECT_US]]()
-; CHECK-NEXT: br i1 true, label %[[LOOP_US]], label %[[RET_SPLIT_US:.*]]
+; CHECK-NEXT: br i1 true, label %[[LOOP_US]], label %[[RET_SPLIT_US:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[RET_SPLIT_US]]:
; CHECK-NEXT: [[I3_LCSSA_US:%.*]] = phi i32 [ [[I3_US]], %[[BB1]] ]
; CHECK-NEXT: br label %[[RET:.*]]
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll
new file mode 100644
index 0000000..dd524ab
--- /dev/null
+++ b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
+
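+; The loop below (%A <-> %B) is only reachable through an indirectbr, so it
+; has no ordinary preheader. With term-folding enabled, the checks pin the
+; resulting CFG: the constant branches are kept as-is and a dedicated
+; %C.loopexit block is split off for the exit.
+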
+define void @test(ptr %addr) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[ADDR:%.*]]) {
+; CHECK-NEXT: indirectbr ptr [[ADDR]], [label %[[A:.*]], label %C]
+; CHECK: [[A]]:
+; CHECK-NEXT: br i1 true, label %[[B:.*]], label %[[C_LOOPEXIT:.*]]
+; CHECK: [[B]]:
+; CHECK-NEXT: br i1 true, label %[[A]], label %[[C_LOOPEXIT]]
+; CHECK: [[C_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[C:.*]]
+; CHECK: [[C]]:
+; CHECK-NEXT: unreachable
+;
+
+ indirectbr ptr %addr, [label %A, label %C]
+
+A:
+ br i1 true, label %B, label %C
+
+B:
+ br i1 true, label %A, label %C
+
+C:
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 1f61989..812bca9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -46,27 +46,17 @@ define void @_Z3foov() {
; CHECK-V2-IC4-LABEL: define void @_Z3foov(
; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY1:.*:]]
-; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
-; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-V2-IC4: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
; CHECK-V2-IC4: [[VECTOR_PH]]:
; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC4: [[VECTOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
-; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
-; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]]
-; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
-; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
-; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
-; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC4: [[SCALAR_PH]]:
; CHECK-V2-IC4: br label %[[FOR_BODY:.*]]
; CHECK-V2-IC4: [[FOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]:
;
entry:
@@ -111,9 +101,6 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
-; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0}
-; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
-; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1}
-; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
new file mode 100644
index 0000000..5b8acee
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -0,0 +1,395 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+; Original loop has trip count 17, but contains interleave groups with gaps, so
+; the last iteration must execute in the scalar loop. Thus the vector loop can
+; only execute up to 16 iterations.
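+; With the fixed VF of 16, the main vector loop covers indices [0, 16) in a
+; single iteration; the remaining iteration is handled by the vscale x 2
+; epilogue loop or by the scalar loop (see the urem of 17 in vec.epilog.ph
+; below).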
+define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 {
+; CHECK-LABEL: define i64 @vector_loop_with_remaining_iterations(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64>
+; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
+; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 2 x i64> [[TMP25]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT4]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = mul i64 1, [[TMP21]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP27]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 2 x i64> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
+; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0
+; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP33]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 2 x i32> [[TMP31]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP35]] = or <vscale x 2 x i64> [[VEC_PHI8]], [[TMP34]]
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
+; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1
+; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]])
+; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]])
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
+; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64
+; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+ %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3
+ %l = load i8, ptr %gep.src.i.i, align 1
+ %l.ext = zext i8 %l to i32
+ %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+ %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext)
+ %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+ %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0)
+ %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
+ store i8 0, ptr %gep.dst, align 1
+ %min.ext = zext i32 %min.1 to i64
+ %red.next = or i64 %red, %min.ext
+ %iv.next = add i64 %iv, 1
+ %exitcond.not.i.i = icmp eq i64 %iv.next, 17
+ br i1 %exitcond.not.i.i, label %exit, label %loop
+
+exit:
+ ret i64 %red.next
+}
+
+; Original loop has trip count 17, but contains interleave groups with gaps, so
+; the last iteration must execute in the scalar loop. Thus the vector loop can
+; only execute up to 16 iterations.
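+; The loop IR below is identical to the previous test; the generated checks
+; differ only in value and loop-metadata numbering.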
+define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 {
+; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP27]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64>
+; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
+; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP38:%.*]] = mul <vscale x 2 x i64> [[TMP25]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT4]], [[TMP38]]
+; CHECK-NEXT: [[TMP39:%.*]] = mul i64 1, [[TMP21]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP39]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 2 x i64> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
+; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0
+; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP33]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 2 x i32> [[TMP31]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP35]] = or <vscale x 2 x i64> [[VEC_PHI8]], [[TMP34]]
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
+; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1
+; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]])
+; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]])
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
+; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64
+; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+ %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3
+ %l = load i8, ptr %gep.src.i.i, align 1
+ %l.ext = zext i8 %l to i32
+ %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+ %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext)
+ %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+ %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0)
+ %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
+ store i8 0, ptr %gep.dst, align 1
+ %min.ext = zext i32 %min.1 to i64
+ %red.next = or i64 %red, %min.ext
+ %iv.next = add i64 %iv, 1
+ %exitcond.not.i.i = icmp eq i64 %iv.next, 17
+ br i1 %exitcond.not.i.i, label %exit, label %loop
+
+exit:
+ ret i64 %red.next
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/149726.
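+; The interleave group below (an <8 x i64> wide load used only at its even
+; elements) has gaps, %iv steps by 2 up to 14 for a trip count of 8, and
+; VF 4 yields a single main-vector-loop iteration; the scalar loop resumes
+; at %iv = 8.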
+define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, ptr noalias %J, ptr noalias %K, ptr %L) #1 {
+; CHECK-LABEL: define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[F:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[I:%.*]], ptr noalias [[J:%.*]], ptr noalias [[K:%.*]], ptr [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
+; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
+; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2
+; CHECK-NEXT: store i64 0, ptr [[A]], align 8
+; CHECK-NEXT: store i64 0, ptr [[B]], align 8
+; CHECK-NEXT: store i64 0, ptr [[C]], align 8
+; CHECK-NEXT: store i64 0, ptr [[D]], align 8
+; CHECK-NEXT: store i64 0, ptr [[E]], align 8
+; CHECK-NEXT: store i64 0, ptr [[F]], align 8
+; CHECK-NEXT: store i64 0, ptr [[G]], align 8
+; CHECK-NEXT: store i64 0, ptr [[H]], align 8
+; CHECK-NEXT: store i64 0, ptr [[I]], align 8
+; CHECK-NEXT: store i64 0, ptr [[L]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_J1:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV1]]
+; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J1]], align 8
+; CHECK-NEXT: [[L_TRUNC:%.*]] = trunc i64 [[L_J]] to i16
+; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV1]]
+; CHECK-NEXT: store i16 [[L_TRUNC]], ptr [[GEP_K]], align 2
+; CHECK-NEXT: store i64 0, ptr [[A]], align 8
+; CHECK-NEXT: store i64 0, ptr [[B]], align 8
+; CHECK-NEXT: store i64 0, ptr [[C]], align 8
+; CHECK-NEXT: store i64 0, ptr [[D]], align 8
+; CHECK-NEXT: store i64 0, ptr [[E]], align 8
+; CHECK-NEXT: store i64 0, ptr [[F]], align 8
+; CHECK-NEXT: store i64 0, ptr [[G]], align 8
+; CHECK-NEXT: store i64 0, ptr [[H]], align 8
+; CHECK-NEXT: store i64 0, ptr [[I]], align 8
+; CHECK-NEXT: store i64 0, ptr [[L]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2
+; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.J = getelementptr i64, ptr %J, i64 %iv
+ %l.J = load i64, ptr %gep.J, align 8
+ %l.trunc = trunc i64 %l.J to i16
+ %gep.K = getelementptr i16, ptr %K, i64 %iv
+ store i16 %l.trunc, ptr %gep.K, align 2
+ store i64 0, ptr %A, align 8
+ store i64 0, ptr %B, align 8
+ store i64 0, ptr %C, align 8
+ store i64 0, ptr %D, align 8
+ store i64 0, ptr %E, align 8
+ store i64 0, ptr %F, align 8
+ store i64 0, ptr %G, align 8
+ store i64 0, ptr %H, align 8
+ store i64 0, ptr %I, align 8
+ store i64 0, ptr %L, align 8
+ %iv.next = add i64 %iv, 2
+ %ec = icmp ult i64 %iv, 14
+ br i1 %ec, label %loop, label %exit, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+declare i32 @llvm.umin.i32(i32, i32)
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+attributes #0 = { "target-cpu"="neoverse-512tvb" }
+attributes #1 = { "target-cpu"="grace" }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
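+
+; !0 attaches llvm.loop.mustprogress and llvm.loop.vectorize.enable to the
+; runtime-check loop above; the vectorize.enable metadata requests
+; vectorization regardless of the cost model.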
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 400b031..7090ae8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -7,11 +7,7 @@ target triple = "aarch64-none-unknown-elf"
define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-LABEL: define i32 @dotp(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -33,64 +29,8 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX2]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX2]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
-; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
-; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
-; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: for.exit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
+; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK: scalar.ph:
;
entry:
br label %for.body
@@ -142,7 +82,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
@@ -174,7 +114,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
@@ -198,7 +138,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1
; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret void
@@ -557,7 +497,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 14725e0..14a73db 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1816,13 +1816,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1845,7 +1844,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
@@ -1854,13 +1853,13 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1897,7 +1896,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -1906,19 +1905,19 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
@@ -1927,15 +1926,15 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP17]], [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP19]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
;
@@ -1954,7 +1953,7 @@ for.body: ; preds = %entry, %for.body
%conv3 = zext i8 %1 to i64
%mul = mul nuw nsw i64 %conv3, %conv
%add = add i64 %sum, %mul
- %exitcond.not = icmp eq i64 %i.iv.next, 16
+ %exitcond.not = icmp eq i64 %i.iv.next, 41
br i1 %exitcond.not, label %exit, label %for.body
exit: ; preds = %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
deleted file mode 100644
index d85bc48..0000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
+++ /dev/null
@@ -1,146 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
-
-target triple = "aarch64-linux-gnu"
-
-define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 {
-; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations(
-; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ITER_CHECK:.*]]:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
-; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]])
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]])
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP12]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64>
-; CHECK-NEXT: [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]])
-; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
-; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
-; CHECK: [[VEC_EPILOG_PH]]:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP31]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]]
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[TMP36]]
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
-; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = mul <vscale x 2 x i64> [[TMP24]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP25]]
-; CHECK-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP20]]
-; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP37]], i64 0
-; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
-; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP38]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
-; CHECK-NEXT: [[TMP28:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> [[TMP28]])
-; CHECK-NEXT: [[TMP39:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP27]], align 1
-; CHECK-NEXT: [[TMP33:%.*]] = zext <vscale x 2 x i32> [[TMP39]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP34]] = or <vscale x 2 x i64> [[VEC_PHI6]], [[TMP33]]
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], [[TMP20]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT5]]
-; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP34]])
-; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
-; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3
-; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1
-; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
-; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
-; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]])
-; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
-; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]])
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
-; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64
-; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
- %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3
- %l = load i8, ptr %gep.src.i.i, align 1
- %l.ext = zext i8 %l to i32
- %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false)
- %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext)
- %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false)
- %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0)
- %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
- store i8 0, ptr %gep.dst, align 1
- %min.ext = zext i32 %min.1 to i64
- %red.next = or i64 %red, %min.ext
- %iv.next = add i64 %iv, 1
- %exitcond.not.i.i = icmp eq i64 %iv.next, 16
- br i1 %exitcond.not.i.i, label %exit, label %loop
-
-exit:
- ret i64 %red.next
-}
-
-declare i32 @llvm.umin.i32(i32, i32)
-
-declare i32 @llvm.abs.i32(i32, i1 immarg)
-
-attributes #0 = { "target-cpu"="neoverse-512tvb" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 45357dd..dbe6f27 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 2
; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=scalar-epilogue %s 2>&1 | FileCheck %s -check-prefix=SCALAR_EPILOGUE
-; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_TAIL_FOLDING
-; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_EVL
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA-WITH-EVL
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -55,105 +55,105 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SCALAR_EPILOGUE: scalar.ph:
;
-; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor2
-; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
-; PREDICATED_TAIL_FOLDING-NEXT: entry:
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDICATED_TAIL_FOLDING: vector.ph:
-; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
-; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
-; PREDICATED_TAIL_FOLDING: vector.body:
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; PREDICATED_TAIL_FOLDING: middle.block:
-; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_TAIL_FOLDING: scalar.ph:
+; PREDICATED_DATA-LABEL: define void @masked_strided_factor2
+; PREDICATED_DATA-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
+; PREDICATED_DATA-NEXT: entry:
+; PREDICATED_DATA-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDICATED_DATA: vector.ph:
+; PREDICATED_DATA-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_DATA-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
+; PREDICATED_DATA-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
+; PREDICATED_DATA-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; PREDICATED_DATA-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDICATED_DATA-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDICATED_DATA: vector.body:
+; PREDICATED_DATA-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_DATA-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
+; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA: middle.block:
+; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]]
+; PREDICATED_DATA: scalar.ph:
;
-; PREDICATED_EVL-LABEL: define void @masked_strided_factor2
-; PREDICATED_EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
-; PREDICATED_EVL-NEXT: entry:
-; PREDICATED_EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDICATED_EVL: vector.ph:
-; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
-; PREDICATED_EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
-; PREDICATED_EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; PREDICATED_EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; PREDICATED_EVL: vector.body:
-; PREDICATED_EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
-; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_EVL-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
-; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; PREDICATED_EVL: middle.block:
-; PREDICATED_EVL-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_EVL: scalar.ph:
+; PREDICATED_DATA-WITH-EVL-LABEL: define void @masked_strided_factor2
+; PREDICATED_DATA-WITH-EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
+; PREDICATED_DATA-WITH-EVL-NEXT: entry:
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDICATED_DATA-WITH-EVL: vector.ph:
+; PREDICATED_DATA-WITH-EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
+; PREDICATED_DATA-WITH-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
+; PREDICATED_DATA-WITH-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-WITH-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDICATED_DATA-WITH-EVL: vector.body:
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL: middle.block:
+; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
+; PREDICATED_DATA-WITH-EVL: scalar.ph:
;
entry:
%conv = zext i8 %guard to i32
@@ -256,137 +256,137 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SCALAR_EPILOGUE: scalar.ph:
;
-; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor4
-; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
-; PREDICATED_TAIL_FOLDING-NEXT: entry:
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDICATED_TAIL_FOLDING: vector.ph:
-; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
-; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
-; PREDICATED_TAIL_FOLDING: vector.body:
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PREDICATED_TAIL_FOLDING: middle.block:
-; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_TAIL_FOLDING: scalar.ph:
+; PREDICATED_DATA-LABEL: define void @masked_strided_factor4
+; PREDICATED_DATA-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
+; PREDICATED_DATA-NEXT: entry:
+; PREDICATED_DATA-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDICATED_DATA: vector.ph:
+; PREDICATED_DATA-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_DATA-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
+; PREDICATED_DATA-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
+; PREDICATED_DATA-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; PREDICATED_DATA-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDICATED_DATA-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDICATED_DATA: vector.body:
+; PREDICATED_DATA-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_DATA-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
+; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
+; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
+; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
+; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
+; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_DATA-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
+; PREDICATED_DATA-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
+; PREDICATED_DATA-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
+; PREDICATED_DATA-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
+; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
+; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
+; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
+; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_DATA: middle.block:
+; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]]
+; PREDICATED_DATA: scalar.ph:
;
-; PREDICATED_EVL-LABEL: define void @masked_strided_factor4
-; PREDICATED_EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
-; PREDICATED_EVL-NEXT: entry:
-; PREDICATED_EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDICATED_EVL: vector.ph:
-; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
-; PREDICATED_EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
-; PREDICATED_EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; PREDICATED_EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; PREDICATED_EVL: vector.body:
-; PREDICATED_EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
-; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_EVL-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_EVL-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_EVL-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_EVL-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_EVL-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_EVL-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
-; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_EVL-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; PREDICATED_EVL: middle.block:
-; PREDICATED_EVL-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_EVL: scalar.ph:
+; PREDICATED_DATA-WITH-EVL-LABEL: define void @masked_strided_factor4
+; PREDICATED_DATA-WITH-EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
+; PREDICATED_DATA-WITH-EVL-NEXT: entry:
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDICATED_DATA-WITH-EVL: vector.ph:
+; PREDICATED_DATA-WITH-EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
+; PREDICATED_DATA-WITH-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
+; PREDICATED_DATA-WITH-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-WITH-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDICATED_DATA-WITH-EVL: vector.body:
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL: middle.block:
+; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
+; PREDICATED_DATA-WITH-EVL: scalar.ph:
;
entry:
%conv = zext i8 %guard to i32
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 6c57d2f..e2641ab 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -133,15 +133,15 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true)
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr align 1 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -358,3 +358,64 @@ for.end: ; preds = %for.body
attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) }
+; This is a non-power-of-2 low trip count, so we will try to tail-fold it. But
+; the reduction is a multiply, which is only legal for fixed-length VFs, and
+; fixed-length VFs aren't legal for the default tail-folding style
+; data-with-evl, so make sure we gracefully fall back to data-without-lane-mask.
+
+define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) {
+; CHECK-LABEL: @mul_non_pow_2_low_trip_count(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <16 x i64> [[VEC_IV]], splat (i64 9)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP2]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> [[TMP3]])
+; CHECK-NEXT: br label [[FOR_END:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 2, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[MUL]] = mul i8 [[TMP5]], [[RDX]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i8 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i8 [[MUL_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i8 [ 2, %entry ], [ %mul, %for.body ]
+ %gep = getelementptr i8, ptr %a, i64 %iv
+ %0 = load i8, ptr %gep
+ %mul = mul i8 %0, %rdx
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 10
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i8 %mul
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index a1201dcf..0228811 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -7,29 +7,49 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = shl <16 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
-; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i64> [[TMP0]], splat (i64 52)
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 9, [[TMP2]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
+; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[TMP5]], splat (i64 52)
+; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i32> [[TMP9]], splat (i32 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 9)
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <16 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP4]], <16 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8)
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
-; CHECK-NEXT: store i8 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_COND]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP12:%.*]] = mul i32 1, [[TMP11]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <vscale x 2 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT: [[TMP14:%.*]] = icmp sge <vscale x 2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i32> [[TMP7]], <vscale x 2 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP16:%.*]] = shl <vscale x 2 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT: [[TMP17:%.*]] = trunc <vscale x 2 x i32> [[TMP16]] to <vscale x 2 x i8>
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> [[TMP17]], <vscale x 2 x ptr> align 1 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT1:%.*]]
; CHECK: scalar.ph:
@@ -52,7 +72,7 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8
; CHECK-NEXT: store i8 [[TRUNC]], ptr [[P]], align 1
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 8
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -84,8 +104,9 @@ exit: ; preds = %for.body
ret void
}
;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
deleted file mode 100644
index 4844c2f..0000000
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ /dev/null
@@ -1,690 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-;; This is the loop in C++ being vectorized in this file with
-;; vector.reverse
-;; #pragma clang loop vectorize_width(4, scalable)
-;; for (int i = N-1; i >= 0; --i)
-;; a[i] = b[i] + 1.0;
-
-; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S < %s \
-; RUN: | FileCheck --check-prefix=RV64 %s
-
-; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v -S < %s \
-; RUN: | FileCheck --check-prefix=RV32 %s
-
-; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-interleave=2 -S < %s \
-; RUN: | FileCheck --check-prefix=RV64-UF2 %s
-
-define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
-; RV64-LABEL: define void @vector_reverse_i32(
-; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; RV64-NEXT: [[ENTRY:.*]]:
-; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64: [[VECTOR_PH]]:
-; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
-; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
-; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
-; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV64: [[VECTOR_BODY]]:
-; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
-; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP19]], align 4
-; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; RV64: [[MIDDLE_BLOCK]]:
-; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
-; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64: [[SCALAR_PH]]:
-; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-NEXT: br label %[[FOR_BODY:.*]]
-; RV64: [[FOR_BODY]]:
-; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1
-; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; RV64: [[EXIT]]:
-; RV64-NEXT: ret void
-;
-; RV32-LABEL: define void @vector_reverse_i32(
-; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; RV32-NEXT: [[ENTRY:.*]]:
-; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV32: [[VECTOR_PH]]:
-; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
-; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
-; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
-; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV32: [[VECTOR_BODY]]:
-; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
-; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
-; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
-; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP21]], align 4
-; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; RV32: [[MIDDLE_BLOCK]]:
-; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
-; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV32: [[SCALAR_PH]]:
-; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV32-NEXT: br label %[[FOR_BODY:.*]]
-; RV32: [[FOR_BODY]]:
-; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1
-; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
-; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
-; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; RV32: [[EXIT]]:
-; RV32-NEXT: ret void
-;
-; RV64-UF2-LABEL: define void @vector_reverse_i32(
-; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; RV64-UF2-NEXT: [[ENTRY:.*]]:
-; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64-UF2: [[VECTOR_PH]]:
-; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
-; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
-; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
-; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV64-UF2: [[VECTOR_BODY]]:
-; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
-; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; RV64-UF2: [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
-; RV64-UF2: [[FOR_BODY]]:
-; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1
-; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; RV64-UF2: [[EXIT]]:
-; RV64-UF2-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
- %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
- %iv.next = add nsw i64 %dec.iv, -1
- %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next
- %0 = load i32, ptr %arrayidx.b, align 4
- %add = add i32 %0, 1
- %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next
- store i32 %add, ptr %arrayidx.a, align 4
- %cmp = icmp ugt i64 %dec.iv, 1
- br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
-
-exit:
- ret void
-}
-
-define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
-; RV64-LABEL: define void @vector_reverse_f32(
-; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-NEXT: [[ENTRY:.*]]:
-; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64: [[VECTOR_PH]]:
-; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
-; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
-; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
-; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV64: [[VECTOR_BODY]]:
-; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]]
-; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
-; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
-; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
-; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
-; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1
-; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]]
-; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
-; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
-; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
-; RV64-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP19]], align 4
-; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; RV64: [[MIDDLE_BLOCK]]:
-; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
-; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64: [[SCALAR_PH]]:
-; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-NEXT: br label %[[FOR_BODY:.*]]
-; RV64: [[FOR_BODY]]:
-; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
-; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; RV64: [[EXIT]]:
-; RV64-NEXT: ret void
-;
-; RV32-LABEL: define void @vector_reverse_f32(
-; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV32-NEXT: [[ENTRY:.*]]:
-; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV32: [[VECTOR_PH]]:
-; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
-; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
-; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
-; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV32: [[VECTOR_BODY]]:
-; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]]
-; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1
-; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]]
-; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]]
-; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
-; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
-; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
-; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
-; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
-; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
-; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
-; RV32-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP21]], align 4
-; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; RV32: [[MIDDLE_BLOCK]]:
-; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
-; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV32: [[SCALAR_PH]]:
-; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV32-NEXT: br label %[[FOR_BODY:.*]]
-; RV32: [[FOR_BODY]]:
-; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
-; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
-; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
-; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; RV32: [[EXIT]]:
-; RV32-NEXT: ret void
-;
-; RV64-UF2-LABEL: define void @vector_reverse_f32(
-; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-UF2-NEXT: [[ENTRY:.*]]:
-; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
-; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64-UF2: [[VECTOR_PH]]:
-; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
-; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
-; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
-; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV64-UF2: [[VECTOR_BODY]]:
-; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]]
-; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
-; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]]
-; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]]
-; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
-; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4
-; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
-; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]]
-; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]]
-; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]]
-; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]]
-; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1
-; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]]
-; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]]
-; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
-; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4
-; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
-; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; RV64-UF2: [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
-; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
-; RV64-UF2: [[FOR_BODY]]:
-; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4
-; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
-; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4
-; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; RV64-UF2: [[EXIT]]:
-; RV64-UF2-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
- %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
- %iv.next = add nsw i64 %dec.iv, -1
- %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next
- %0 = load float, ptr %arrayidx.b, align 4
- %fadd = fadd float %0, 1.000000e+00
- %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next
- store float %fadd, ptr %arrayidx.a, align 4
- %cmp = icmp ugt i64 %dec.iv, 1
- br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
-
-exit:
- ret void
-}
-
-define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
-; RV64-LABEL: define void @vector_reverse_irregular_type(
-; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-NEXT: [[ENTRY:.*]]:
-; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64: [[VECTOR_PH]]:
-; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV64: [[VECTOR_BODY]]:
-; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-NEXT: [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
-; RV64-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
-; RV64-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
-; RV64-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
-; RV64-NEXT: [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
-; RV64-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
-; RV64-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
-; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
-; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
-; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
-; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
-; RV64-NEXT: [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
-; RV64-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
-; RV64-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
-; RV64-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
-; RV64-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
-; RV64-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
-; RV64-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
-; RV64-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
-; RV64-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
-; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
-; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
-; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
-; RV64-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
-; RV64-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
-; RV64-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
-; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1
-; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; RV64: [[MIDDLE_BLOCK]]:
-; RV64-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64: [[SCALAR_PH]]:
-; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-NEXT: br label %[[FOR_BODY:.*]]
-; RV64: [[FOR_BODY]]:
-; RV64-NEXT: [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT: [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
-; RV64-NEXT: [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
-; RV64-NEXT: [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
-; RV64-NEXT: [[ADD:%.*]] = add i7 [[TMP30]], 1
-; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
-; RV64-NEXT: store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
-; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
-; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; RV64: [[EXIT]]:
-; RV64-NEXT: ret void
-;
-; RV32-LABEL: define void @vector_reverse_irregular_type(
-; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV32-NEXT: [[ENTRY:.*]]:
-; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV32: [[VECTOR_PH]]:
-; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV32: [[VECTOR_BODY]]:
-; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV32-NEXT: [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
-; RV32-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
-; RV32-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
-; RV32-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
-; RV32-NEXT: [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
-; RV32-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
-; RV32-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
-; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
-; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
-; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
-; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
-; RV32-NEXT: [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
-; RV32-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
-; RV32-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
-; RV32-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
-; RV32-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
-; RV32-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
-; RV32-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
-; RV32-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
-; RV32-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
-; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
-; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
-; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
-; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
-; RV32-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
-; RV32-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
-; RV32-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
-; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1
-; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
-; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; RV32: [[MIDDLE_BLOCK]]:
-; RV32-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV32: [[SCALAR_PH]]:
-; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV32-NEXT: br label %[[FOR_BODY:.*]]
-; RV32: [[FOR_BODY]]:
-; RV32-NEXT: [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
-; RV32-NEXT: [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
-; RV32-NEXT: [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
-; RV32-NEXT: [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
-; RV32-NEXT: [[ADD:%.*]] = add i7 [[TMP30]], 1
-; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
-; RV32-NEXT: store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
-; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
-; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; RV32: [[EXIT]]:
-; RV32-NEXT: ret void
-;
-; RV64-UF2-LABEL: define void @vector_reverse_irregular_type(
-; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
-; RV64-UF2-NEXT: [[ENTRY:.*]]:
-; RV64-UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; RV64-UF2: [[VECTOR_PH]]:
-; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
-; RV64-UF2: [[VECTOR_BODY]]:
-; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; RV64-UF2-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0
-; RV64-UF2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
-; RV64-UF2-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -2
-; RV64-UF2-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], -3
-; RV64-UF2-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], -4
-; RV64-UF2-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], -5
-; RV64-UF2-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], -6
-; RV64-UF2-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], -7
-; RV64-UF2-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP16]], -1
-; RV64-UF2-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP0]], -1
-; RV64-UF2-NEXT: [[TMP51:%.*]] = add nsw i64 [[TMP17]], -1
-; RV64-UF2-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP24]], -1
-; RV64-UF2-NEXT: [[TMP59:%.*]] = add nsw i64 [[TMP25]], -1
-; RV64-UF2-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP42]], -1
-; RV64-UF2-NEXT: [[TMP14:%.*]] = add nsw i64 [[TMP43]], -1
-; RV64-UF2-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP50]], -1
-; RV64-UF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP1]]
-; RV64-UF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP2]]
-; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP51]]
-; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP59]]
-; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]]
-; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]]
-; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP5:%.*]] = load i7, ptr [[TMP3]], align 1
-; RV64-UF2-NEXT: [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1
-; RV64-UF2-NEXT: [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1
-; RV64-UF2-NEXT: [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1
-; RV64-UF2-NEXT: [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0
-; RV64-UF2-NEXT: [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1
-; RV64-UF2-NEXT: [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2
-; RV64-UF2-NEXT: [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3
-; RV64-UF2-NEXT: [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1
-; RV64-UF2-NEXT: [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1
-; RV64-UF2-NEXT: [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1
-; RV64-UF2-NEXT: [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1
-; RV64-UF2-NEXT: [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0
-; RV64-UF2-NEXT: [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1
-; RV64-UF2-NEXT: [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
-; RV64-UF2-NEXT: [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
-; RV64-UF2-NEXT: [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
-; RV64-UF2-NEXT: [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
-; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]]
-; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]]
-; RV64-UF2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]]
-; RV64-UF2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
-; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]]
-; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
-; RV64-UF2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
-; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
-; RV64-UF2-NEXT: [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
-; RV64-UF2-NEXT: store i7 [[TMP7]], ptr [[TMP9]], align 1
-; RV64-UF2-NEXT: [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
-; RV64-UF2-NEXT: store i7 [[TMP8]], ptr [[TMP10]], align 1
-; RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
-; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP44]], align 1
-; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
-; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP45]], align 1
-; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
-; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP46]], align 1
-; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
-; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP47]], align 1
-; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
-; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP48]], align 1
-; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
-; RV64-UF2-NEXT: store i7 [[TMP57]], ptr [[TMP49]], align 1
-; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
-; RV64-UF2-NEXT: br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; RV64-UF2: [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; RV64-UF2: [[SCALAR_PH]]:
-; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
-; RV64-UF2: [[FOR_BODY]]:
-; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
-; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: [[TMP12:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
-; RV64-UF2-NEXT: [[ADD:%.*]] = add i7 [[TMP12]], 1
-; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
-; RV64-UF2-NEXT: store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
-; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
-; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; RV64-UF2: [[EXIT]]:
-; RV64-UF2-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
- %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
- %iv.next = add nsw i64 %dec.iv, -1
- %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next
- %0 = load i7, ptr %arrayidx.b, align 1
- %add = add i7 %0, 1
- %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next
- store i7 %add, ptr %arrayidx.a, align 1
- %cmp = icmp ugt i64 %dec.iv, 1
- br i1 %cmp, label %for.body, label %exit, !llvm.loop !4
-
-exit:
- ret void
-}
-
-!0 = distinct !{!0, !1, !2, !3}
-!1 = !{!"llvm.loop.vectorize.width", i32 4}
-!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-!4 = distinct !{!4, !1, !3}
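
The !llvm.loop metadata removed above is what drives these tests: !1 requests a vectorization width of 4, !2 makes that width scalable (VF = vscale x 4), and !3 force-enables the vectorizer, mirroring the C++ pragma quoted in the test comments (!4 carries the same hints minus the scalable flag). As a minimal, self-contained sketch (not part of this commit; the function name @hint_example and its body are illustrative only, and the metadata names are copied from the test above), the hints attach to the loop latch branch like so:

; Minimal sketch: a scalar loop whose latch branch carries the same
; vectorizer hints used by the tests in this diff. Assumes %n > 0.
define void @hint_example(ptr %p, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %addr = getelementptr inbounds i32, ptr %p, i64 %iv
  %v = load i32, ptr %addr, align 4
  %v1 = add i32 %v, 1
  store i32 %v1, ptr %addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  ; The !llvm.loop attachment on this latch branch is where the hints live.
  br i1 %done, label %exit, label %loop, !llvm.loop !0

exit:
  ret void
}

!0 = distinct !{!0, !1, !2, !3}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}

Running opt -passes=loop-vectorize over such a loop (as the RUN lines below do) should honor these hints and produce the vscale-based vector body the checks in this file assert.
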
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ad445c8..f59ab56 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -1,400 +1,455 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; This is the loop in c++ being vectorize in this file with
-;vector.reverse
-; #pragma clang loop vectorize_width(4, scalable)
-; for (int i = N-1; i >= 0; --i)
-; a[i] = b[i] + 1.0;
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "for.body:" --version 5
+;; This is the loop in C++ being vectorized in this file with
+;; vector.reverse
+;; #pragma clang loop vectorize_width(4, scalable)
+;; for (int i = N-1; i >= 0; --i)
+;; a[i] = b[i] + 1.0;
-; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple riscv64-linux-gnu \
-; RUN: -mattr=+v -debug-only=loop-vectorize,vplan -scalable-vectorization=on \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S < %s \
+; RUN: | FileCheck --check-prefix=RV64 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v -S < %s \
+; RUN: | FileCheck --check-prefix=RV32 %s
+
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -force-vector-interleave=2 -S < %s \
+; RUN: | FileCheck --check-prefix=RV64-UF2 %s
+
+define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_i32(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP10]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]]
+; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
+; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
+; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP18]]
+; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP20]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_BODY]]:
+;
+; RV32-LABEL: define void @vector_reverse_i32(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV32-NEXT: [[ENTRY:.*]]:
+; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
+; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP10:%.*]] = mul i32 0, [[TMP9]]
+; RV32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], 1
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP11]]
+; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 [[TMP10]]
+; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]]
+; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
+; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
+; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 [[TMP20]]
+; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP22]], align 4
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV32-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; RV32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_BODY]]:
+;
+; RV64-UF2-LABEL: define void @vector_reverse_i32(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; RV64-UF2-NEXT: [[ENTRY:.*]]:
+; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
+; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP11]]
+; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
+; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]]
+; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP15]]
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]]
+; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]])
+; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP21:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP24]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP25]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP29:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP29]]
+; RV64-UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP28]]
+; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP30]]
+; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP27]], align 4
+; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP32]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_BODY]]:
+;
+entry:
+ br label %for.body
+
+for.body:
+ %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+ %iv.next = add nsw i64 %dec.iv, -1
+ %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next
+ %0 = load i32, ptr %arrayidx.b, align 4
+ %add = add i32 %0, 1
+ %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next
+ store i32 %add, ptr %arrayidx.a, align 4
+ %cmp = icmp ugt i64 %dec.iv, 1
+ br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
+
+exit:
+ ret void
+}
define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
-; CHECK-LABEL: 'vector_reverse_i64'
-; CHECK-NEXT: LV: Loop hints: force=enabled width=vscale x 4 interleave=0
-; CHECK-NEXT: LV: Found a loop: for.body
-; CHECK-NEXT: LV: Found an induction variable.
-; CHECK-NEXT: LV: Found an induction variable.
-; CHECK-NEXT: LV: Did not find one integer induction var.
-; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Found trip count: 0
-; CHECK-NEXT: LV: Found maximum trip count: 4294967295
-; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
-; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
-; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
-; CHECK-NEXT: LV: Using user VF vscale x 4.
-; CHECK-NEXT: Creating VPBasicBlock for for.body
-; CHECK-NEXT: VPlan 'Plain CFG
-; CHECK-NEXT: for UF>=1' {
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: IR %0 = zext i32 %n to i64
-; CHECK-NEXT: Successor(s): for.body
-; CHECK-EMPTY:
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
-; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx>
-; CHECK-NEXT: EMIT ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT: EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom>
-; CHECK-NEXT: EMIT store ir<%add9>, ir<%arrayidx3>
-; CHECK-NEXT: EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1>
-; CHECK-NEXT: EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1>
-; CHECK-NEXT: EMIT branch-on-cond ir<%cmp>
-; CHECK-NEXT: Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
-; CHECK-NEXT: vp<%3> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: IR %0 = zext i32 %n to i64
-; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
-; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1>
-; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1>
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
-; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
-; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph)
-; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: IR %1 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: IR %add9 = add i32 %1, 1
-; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: IR store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
-; CHECK-NEXT: LV(REG): Calculating max register usage:
-; CHECK-NEXT: LV(REG): At #0 Interval # 0
-; CHECK-NEXT: LV(REG): At #1 Interval # 1
-; CHECK-NEXT: LV(REG): At #2 Interval # 2
-; CHECK-NEXT: LV(REG): At #3 Interval # 2
-; CHECK-NEXT: LV(REG): At #4 Interval # 2
-; CHECK-NEXT: LV(REG): At #5 Interval # 2
-; CHECK-NEXT: LV(REG): At #6 Interval # 3
-; CHECK-NEXT: LV(REG): At #7 Interval # 3
-; CHECK-NEXT: LV(REG): At #8 Interval # 3
-; CHECK-NEXT: LV(REG): At #9 Interval # 3
-; CHECK-NEXT: LV(REG): At #10 Interval # 3
-; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 2
-; CHECK-NEXT: LV(REG): At #13 Interval # 2
-; CHECK-NEXT: LV(REG): VF = vscale x 4
-; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
-; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 24
-; CHECK-NEXT: LV: IC is 1
-; CHECK-NEXT: LV: VF is vscale x 4
-; CHECK-NEXT: LV: Not Interleaving.
-; CHECK-NEXT: LV: Interleaving is not beneficial.
-; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%18> = VF
-; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF
-; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count
-; CHECK-NEXT: Live-in ir<%0> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: IR %0 = zext i32 %n to i64
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.scevcheck>:
-; CHECK-NEXT: IR %3 = add nsw i64 %0, -1
-; CHECK-NEXT: IR %4 = add i32 %n, -1
-; CHECK-NEXT: IR %5 = trunc i64 %3 to i32
-; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result
-; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT: IR %10 = or i1 %8, %9
-; CHECK-NEXT: EMIT branch-on-cond ir<%10>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.memcheck>:
-; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %12 = mul nuw i64 %11, 4
-; CHECK-NEXT: IR %13 = mul i64 %12, 4
-; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
-; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.ph>:
-; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %16 = mul nuw i64 %15, 4
-; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: Successor(s): vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%6>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%7>, ir<%add9>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
-; CHECK-NEXT: Successor(s): middle.block, vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: IR %19 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: IR %add9 = add i32 %19, 1
-; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: IR store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body.preheader> in BB: for.body.preheader
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: for.body.preheader: ; preds = %entry
-; CHECK-NEXT: %0 = zext i32 %n to i64
-; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %2 = mul nuw i64 %1, 4
-; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
-; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.scevcheck: ; No predecessors!
-; CHECK-NEXT: %3 = add nsw i64 %0, -1
-; CHECK-NEXT: %4 = add i32 %n, -1
-; CHECK-NEXT: %5 = trunc i64 %3 to i32
-; CHECK-NEXT: %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT: %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT: %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT: %6 = sub i32 %4, %mul.result
-; CHECK-NEXT: %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT: %10 = or i1 %8, %9
-; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!>
-; CHECK-NEXT: LV: draw edge from for.body.preheader
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.memcheck: ; No predecessors!
-; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %12 = mul nuw i64 %11, 4
-; CHECK-NEXT: %13 = mul i64 %12, 4
-; CHECK-NEXT: %14 = sub i64 %B1, %A2
-; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!>
-; CHECK-NEXT: LV: draw edge from vector.scevcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.ph: ; No predecessors!
-; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %16 = mul nuw i64 %15, 4
-; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: %19 = sub i64 %0, %n.vec
-; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT: %20 = sub i32 %n, %.cast
-; CHECK-NEXT: br
-; CHECK-NEXT: LV: draw edge from vector.memcheck
-; CHECK-NEXT: LV: created vector.body
-; CHECK-NEXT: LV: draw edge from vector.ph
-; CHECK-NEXT: LV: vectorizing VPBB: vector.body in BB: vector.body
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph
-; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ]
-; CHECK-NEXT: %.cast3 = trunc i64 %index to i32
-; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3
-; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
-; CHECK-NEXT: %22 = zext i32 %21 to i64
-; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22
-; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 %18, 1
-; CHECK-NEXT: %26 = mul i64 -1, %25
-; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24
-; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26
-; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %28, align 4
-; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
-; CHECK-NEXT: %29 = add <vscale x 4 x i32> %reverse, splat (i32 1)
-; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22
-; CHECK-NEXT: %31 = mul i64 0, %18
-; CHECK-NEXT: %32 = sub i64 %18, 1
-; CHECK-NEXT: %33 = mul i64 -1, %32
-; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31
-; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29)
-; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %35, align 4
-; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
-; CHECK-NEXT: LV: created middle.block
-; CHECK-NEXT: LV: draw edge from vector.body
-; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: middle.block: ; preds = %vector.body
-; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec
-; CHECK-NEXT: br i1 %cmp.n, <null operand!>, <null operand!>
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.cond.cleanup.loopexit> in BB: for.cond.cleanup.loopexit
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body
-; CHECK-NEXT: br label %for.cond.cleanup
-; CHECK-NEXT: LV: draw edge from middle.block
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader
-; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
-; CHECK-NEXT: br label %for.body
-; CHECK-NEXT: LV: draw edge from middle.block
-; CHECK-NEXT: LV: draw edge from for.body.preheader
-; CHECK-NEXT: LV: draw edge from vector.scevcheck
-; CHECK-NEXT: LV: draw edge from vector.memcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body> in BB: for.body
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
-; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
-; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4
-; CHECK-NEXT: %add9 = add i32 %37, 1
-; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4
-; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
-; CHECK-NEXT: LV: draw edge from scalar.ph
-; CHECK-NEXT: LV: Interleaving disabled by the pass manager
-; CHECK-NEXT: LV: Vectorizing: innermost loop.
-; CHECK-EMPTY:
+; RV64-LABEL: define void @vector_reverse_i64(
+; RV64-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*:]]
+; RV64-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; RV64-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64: [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64: [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; RV64: [[VECTOR_MEMCHECK]]:
+; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
+; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
+; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4
+; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]]
+; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP22]]
+; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]]
+; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1
+; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]]
+; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP24]]
+; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP26]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP28]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP29:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]]
+; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1
+; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]]
+; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i64 [[TMP31]]
+; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP33]]
+; RV64-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]])
+; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP35]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+;
+; RV32-LABEL: define void @vector_reverse_i64(
+; RV32-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+; RV32-NEXT: [[ENTRY:.*:]]
+; RV32-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i32
+; RV32-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i32
+; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV32: [[FOR_BODY_PREHEADER]]:
+; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; RV32: [[VECTOR_MEMCHECK]]:
+; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
+; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]]
+; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
+; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]]
+; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]]
+; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]]
+; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
+; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]]
+; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1
+; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]]
+; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP17]]
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
+; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32
+; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]]
+; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1
+; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]]
+; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP25]]
+; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 [[TMP27]]
+; RV32-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP22]])
+; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV32-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV32: [[FOR_COND_CLEANUP]]:
+; RV32-NEXT: ret void
+; RV32: [[FOR_BODY]]:
+;
+; RV64-UF2-LABEL: define void @vector_reverse_i64(
+; RV64-UF2-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT: [[ENTRY:.*:]]
+; RV64-UF2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; RV64-UF2-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64-UF2: [[FOR_BODY_PREHEADER]]:
+; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
+; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64-UF2: [[VECTOR_SCEVCHECK]]:
+; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; RV64-UF2: [[VECTOR_MEMCHECK]]:
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
+; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
+; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
+; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]]
+; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-UF2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4
+; RV64-UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; RV64-UF2-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = add nsw i32 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
+; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP25]]
+; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP32:%.*]] = mul i64 -1, [[TMP31]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP30]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i64 [[TMP32]]
+; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, ptr [[TMP34]], align 4
+; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD4]])
+; RV64-UF2-NEXT: [[TMP35:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP36:%.*]] = add <vscale x 4 x i32> [[REVERSE5]], splat (i32 1)
+; RV64-UF2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP38:%.*]] = mul i64 0, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP39:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i64 -1, [[TMP39]]
+; RV64-UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP38]]
+; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i64 [[TMP40]]
+; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i64 -1, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP44:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP45:%.*]] = mul i64 -1, [[TMP44]]
+; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP43]]
+; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i64 [[TMP45]]
+; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP35]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE6]], ptr [[TMP42]], align 4
+; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP36]])
+; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE7]], ptr [[TMP47]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; RV64-UF2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64-UF2: [[FOR_COND_CLEANUP]]:
+; RV64-UF2-NEXT: ret void
+; RV64-UF2: [[FOR_BODY]]:
;
entry:
%cmp7 = icmp sgt i32 %n, 0
@@ -423,390 +478,259 @@ for.body: ; preds = %for.body.preheader,
}
define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
-; CHECK-LABEL: 'vector_reverse_f32'
-; CHECK-NEXT: LV: Loop hints: force=enabled width=vscale x 4 interleave=0
-; CHECK-NEXT: LV: Found a loop: for.body
-; CHECK-NEXT: LV: Found an induction variable.
-; CHECK-NEXT: LV: Found an induction variable.
-; CHECK-NEXT: LV: Found FP op with unsafe algebra.
-; CHECK-NEXT: LV: Did not find one integer induction var.
-; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Found trip count: 0
-; CHECK-NEXT: LV: Found maximum trip count: 4294967295
-; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
-; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
-; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
-; CHECK-NEXT: LV: Using user VF vscale x 4.
-; CHECK-NEXT: Creating VPBasicBlock for for.body
-; CHECK-NEXT: VPlan 'Plain CFG
-; CHECK-NEXT: for UF>=1' {
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: IR %0 = zext i32 %n to i64
-; CHECK-NEXT: Successor(s): for.body
-; CHECK-EMPTY:
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
-; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx>
-; CHECK-NEXT: EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom>
-; CHECK-NEXT: EMIT store ir<%conv1>, ir<%arrayidx3>
-; CHECK-NEXT: EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1>
-; CHECK-NEXT: EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1>
-; CHECK-NEXT: EMIT branch-on-cond ir<%cmp>
-; CHECK-NEXT: Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
-; CHECK-NEXT: vp<%3> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: IR %0 = zext i32 %n to i64
-; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
-; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1>
-; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1>
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
-; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<%9>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
-; CHECK-NEXT: WIDEN store vp<%10>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph)
-; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: IR %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: IR %conv1 = fadd float %1, 1.000000e+00
-; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: IR store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
-; CHECK-NEXT: LV(REG): Calculating max register usage:
-; CHECK-NEXT: LV(REG): At #0 Interval # 0
-; CHECK-NEXT: LV(REG): At #1 Interval # 1
-; CHECK-NEXT: LV(REG): At #2 Interval # 2
-; CHECK-NEXT: LV(REG): At #3 Interval # 2
-; CHECK-NEXT: LV(REG): At #4 Interval # 2
-; CHECK-NEXT: LV(REG): At #5 Interval # 2
-; CHECK-NEXT: LV(REG): At #6 Interval # 3
-; CHECK-NEXT: LV(REG): At #7 Interval # 3
-; CHECK-NEXT: LV(REG): At #8 Interval # 3
-; CHECK-NEXT: LV(REG): At #9 Interval # 3
-; CHECK-NEXT: LV(REG): At #10 Interval # 3
-; CHECK-NEXT: LV(REG): At #11 Interval # 3
-; CHECK-NEXT: LV(REG): At #12 Interval # 2
-; CHECK-NEXT: LV(REG): At #13 Interval # 2
-; CHECK-NEXT: LV(REG): VF = vscale x 4
-; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
-; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
-; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop cost is 26
-; CHECK-NEXT: LV: IC is 1
-; CHECK-NEXT: LV: VF is vscale x 4
-; CHECK-NEXT: LV: Not Interleaving.
-; CHECK-NEXT: LV: Interleaving is not beneficial.
-; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%18> = VF
-; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF
-; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count
-; CHECK-NEXT: Live-in ir<%0> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: IR %0 = zext i32 %n to i64
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.scevcheck>:
-; CHECK-NEXT: IR %3 = add nsw i64 %0, -1
-; CHECK-NEXT: IR %4 = add i32 %n, -1
-; CHECK-NEXT: IR %5 = trunc i64 %3 to i32
-; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result
-; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT: IR %10 = or i1 %8, %9
-; CHECK-NEXT: EMIT branch-on-cond ir<%10>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.memcheck>:
-; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %12 = mul nuw i64 %11, 4
-; CHECK-NEXT: IR %13 = mul i64 %12, 4
-; CHECK-NEXT: IR %14 = sub i64 %B1, %A2
-; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.ph>:
-; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %16 = mul nuw i64 %15, 4
-; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
-; CHECK-NEXT: Successor(s): vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
-; CHECK-NEXT: WIDEN ir<%19> = load vp<%6>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
-; CHECK-NEXT: WIDEN store vp<%7>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
-; CHECK-NEXT: Successor(s): middle.block, vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: IR %19 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: IR %conv1 = fadd float %19, 1.000000e+00
-; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: IR store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body.preheader> in BB: for.body.preheader
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: for.body.preheader: ; preds = %entry
-; CHECK-NEXT: %0 = zext i32 %n to i64
-; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %2 = mul nuw i64 %1, 4
-; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2
-; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.scevcheck: ; No predecessors!
-; CHECK-NEXT: %3 = add nsw i64 %0, -1
-; CHECK-NEXT: %4 = add i32 %n, -1
-; CHECK-NEXT: %5 = trunc i64 %3 to i32
-; CHECK-NEXT: %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT: %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT: %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT: %6 = sub i32 %4, %mul.result
-; CHECK-NEXT: %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT: %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT: %10 = or i1 %8, %9
-; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!>
-; CHECK-NEXT: LV: draw edge from for.body.preheader
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.memcheck: ; No predecessors!
-; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %12 = mul nuw i64 %11, 4
-; CHECK-NEXT: %13 = mul i64 %12, 4
-; CHECK-NEXT: %14 = sub i64 %B1, %A2
-; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13
-; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!>
-; CHECK-NEXT: LV: draw edge from vector.scevcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.ph: ; No predecessors!
-; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %16 = mul nuw i64 %15, 4
-; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: %18 = mul nuw i64 %17, 4
-; CHECK-NEXT: %19 = sub i64 %0, %n.vec
-; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT: %20 = sub i32 %n, %.cast
-; CHECK-NEXT: br
-; CHECK-NEXT: LV: draw edge from vector.memcheck
-; CHECK-NEXT: LV: created vector.body
-; CHECK-NEXT: LV: draw edge from vector.ph
-; CHECK-NEXT: LV: vectorizing VPBB: vector.body in BB: vector.body
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph
-; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ]
-; CHECK-NEXT: %.cast3 = trunc i64 %index to i32
-; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3
-; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1
-; CHECK-NEXT: %22 = zext i32 %21 to i64
-; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22
-; CHECK-NEXT: %24 = mul i64 0, %18
-; CHECK-NEXT: %25 = sub i64 %18, 1
-; CHECK-NEXT: %26 = mul i64 -1, %25
-; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24
-; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26
-; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %28, align 4
-; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
-; CHECK-NEXT: %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
-; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22
-; CHECK-NEXT: %31 = mul i64 0, %18
-; CHECK-NEXT: %32 = sub i64 %18, 1
-; CHECK-NEXT: %33 = mul i64 -1, %32
-; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31
-; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33
-; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29)
-; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %35, align 4
-; CHECK-NEXT: %index.next = add nuw i64 %index, %18
-; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body
-; CHECK-NEXT: LV: created middle.block
-; CHECK-NEXT: LV: draw edge from vector.body
-; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: middle.block: ; preds = %vector.body
-; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec
-; CHECK-NEXT: br i1 %cmp.n, <null operand!>, <null operand!>
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.cond.cleanup.loopexit> in BB: for.cond.cleanup.loopexit
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body
-; CHECK-NEXT: br label %for.cond.cleanup
-; CHECK-NEXT: LV: draw edge from middle.block
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader
-; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
-; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
-; CHECK-NEXT: br label %for.body
-; CHECK-NEXT: LV: draw edge from middle.block
-; CHECK-NEXT: LV: draw edge from for.body.preheader
-; CHECK-NEXT: LV: draw edge from vector.scevcheck
-; CHECK-NEXT: LV: draw edge from vector.memcheck
-; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body> in BB: for.body
-; CHECK-NEXT: LV: filled BB:
-; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph
-; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
-; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1
-; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64
-; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00
-; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4
-; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1
-; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1
-; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
-; CHECK-NEXT: LV: draw edge from scalar.ph
-; CHECK-NEXT: LV: Interleaving disabled by the pass manager
-; CHECK-NEXT: LV: Vectorizing: innermost loop.
+; RV64-LABEL: define void @vector_reverse_f32(
+; RV64-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*:]]
+; RV64-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; RV64-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64: [[FOR_BODY_PREHEADER]]:
+; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64: [[VECTOR_SCEVCHECK]]:
+; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; RV64: [[VECTOR_MEMCHECK]]:
+; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
+; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
+; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4
+; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]]
+; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP22]]
+; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]]
+; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1
+; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]]
+; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP24]]
+; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP26]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP28]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP29:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]]
+; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]]
+; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1
+; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]]
+; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP30]], i64 [[TMP31]]
+; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP33]]
+; RV64-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]])
+; RV64-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP35]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+;
+; RV32-LABEL: define void @vector_reverse_f32(
+; RV32-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+; RV32-NEXT: [[ENTRY:.*:]]
+; RV32-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i32
+; RV32-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i32
+; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV32: [[FOR_BODY_PREHEADER]]:
+; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; RV32: [[VECTOR_MEMCHECK]]:
+; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
+; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]]
+; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
+; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]]
+; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]]
+; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
+; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
+; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]]
+; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1
+; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]]
+; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 [[TMP17]]
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]
+; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP21]], align 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP22:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
+; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32
+; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]]
+; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1
+; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]]
+; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP25]]
+; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 [[TMP27]]
+; RV32-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP22]])
+; RV32-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV32-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV32: [[FOR_COND_CLEANUP]]:
+; RV32-NEXT: ret void
+; RV32: [[FOR_BODY]]:
+;
+; RV64-UF2-LABEL: define void @vector_reverse_f32(
+; RV64-UF2-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT: [[ENTRY:.*:]]
+; RV64-UF2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; RV64-UF2-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; RV64-UF2: [[FOR_BODY_PREHEADER]]:
+; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
+; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; RV64-UF2: [[VECTOR_SCEVCHECK]]:
+; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]])
+; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]]
+; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; RV64-UF2: [[VECTOR_MEMCHECK]]:
+; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]]
+; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
+; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
+; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]]
+; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; RV64-UF2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4
+; RV64-UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; RV64-UF2-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i32 [[N]], [[DOTCAST]]
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = add nsw i32 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
+; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP25]]
+; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]]
+; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP32:%.*]] = mul i64 -1, [[TMP31]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP30]]
+; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP33]], i64 [[TMP32]]
+; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP34]], align 4
+; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD4]])
+; RV64-UF2-NEXT: [[TMP35:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP36:%.*]] = fadd <vscale x 4 x float> [[REVERSE5]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP38:%.*]] = mul i64 0, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP39:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i64 -1, [[TMP39]]
+; RV64-UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP38]]
+; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP40]]
+; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i64 -1, [[TMP18]]
+; RV64-UF2-NEXT: [[TMP44:%.*]] = sub i64 [[TMP18]], 1
+; RV64-UF2-NEXT: [[TMP45:%.*]] = mul i64 -1, [[TMP44]]
+; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP43]]
+; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP46]], i64 [[TMP45]]
+; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP35]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE6]], ptr [[TMP42]], align 4
+; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP36]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE7]], ptr [[TMP47]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; RV64-UF2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; RV64-UF2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]]
+; RV64-UF2: [[FOR_COND_CLEANUP]]:
+; RV64-UF2-NEXT: ret void
+; RV64-UF2: [[FOR_BODY]]:
;
entry:
%cmp7 = icmp sgt i32 %n, 0
@@ -834,8 +758,397 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
}
-!0 = distinct !{!0, !1, !2, !3, !4}
-!1 = !{!"llvm.loop.mustprogress"}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_f32_simplify(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP10]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]]
+; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]]
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
+; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1
+; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
+; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]]
+; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP18]]
+; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]])
+; RV64-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP20]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_BODY]]:
+;
+; RV32-LABEL: define void @vector_reverse_f32_simplify(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV32-NEXT: [[ENTRY:.*]]:
+; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
+; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP10:%.*]] = mul i32 0, [[TMP9]]
+; RV32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], 1
+; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP11]]
+; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 [[TMP10]]
+; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]]
+; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
+; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
+; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], 1
+; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]]
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
+; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 [[TMP20]]
+; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]])
+; RV32-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP22]], align 4
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV32-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV32-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; RV32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_BODY]]:
+;
+; RV64-UF2-LABEL: define void @vector_reverse_f32_simplify(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT: [[ENTRY:.*]]:
+; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
+; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
+; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
+; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP11]]
+; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]]
+; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]]
+; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]]
+; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP15]]
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]]
+; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
+; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP19]], align 4
+; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]])
+; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP21:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00)
+; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP24]]
+; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP23]]
+; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP25]]
+; RV64-UF2-NEXT: [[TMP28:%.*]] = mul i64 -1, [[TMP5]]
+; RV64-UF2-NEXT: [[TMP29:%.*]] = sub i64 [[TMP5]], 1
+; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP29]]
+; RV64-UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP31]], i64 [[TMP30]]
+; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP27]], align 4
+; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP21]])
+; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP32]], align 4
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-UF2-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]]
+; RV64-UF2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_BODY]]:
+;
+entry:
+ br label %for.body
+
+for.body:
+ %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+ %iv.next = add nsw i64 %dec.iv, -1
+ %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next
+ %0 = load float, ptr %arrayidx.b, align 4
+ %fadd = fadd float %0, 1.000000e+00
+ %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next
+ store float %fadd, ptr %arrayidx.a, align 4
+ %cmp = icmp ugt i64 %dec.iv, 1
+ br i1 %cmp, label %for.body, label %exit, !llvm.loop !0
+
+exit:
+ ret void
+}
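+; Note on the checks above: the vectorizer forms each reversed contiguous
+; access as gep(base, 0 * VF) followed by gep(.., -(VF - 1)), so a plain
+; unit-stride load or store covers the VF elements ending at the scalar
+; address, and llvm.vector.reverse then restores the descending lane order.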
+
+define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_irregular_type(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV64-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV64-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV64-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV64-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
+; RV64-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
+; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP4]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
+; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP12:%.*]] = load i7, ptr [[TMP8]], align 1
+; RV64-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
+; RV64-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
+; RV64-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
+; RV64-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP12]], i32 0
+; RV64-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
+; RV64-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
+; RV64-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
+; RV64-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]]
+; RV64-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
+; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
+; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV64-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1
+; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV64-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1
+; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV64-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1
+; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
+; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_BODY]]:
+;
+; RV32-LABEL: define void @vector_reverse_irregular_type(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV32-NEXT: [[ENTRY:.*]]:
+; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32: [[VECTOR_PH]]:
+; RV32-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV32: [[VECTOR_BODY]]:
+; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV32-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV32-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV32-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV32-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP0]], -1
+; RV32-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
+; RV32-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
+; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP4]]
+; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
+; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
+; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP12:%.*]] = load i7, ptr [[TMP8]], align 1
+; RV32-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
+; RV32-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
+; RV32-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
+; RV32-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP12]], i32 0
+; RV32-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
+; RV32-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
+; RV32-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
+; RV32-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]]
+; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
+; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
+; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV32-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1
+; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV32-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1
+; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV32-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1
+; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
+; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1
+; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV32: [[MIDDLE_BLOCK]]:
+; RV32-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV32: [[SCALAR_PH]]:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV32-NEXT: br label %[[FOR_BODY:.*]]
+; RV32: [[FOR_BODY]]:
+;
+; RV64-UF2-LABEL: define void @vector_reverse_irregular_type(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT: [[ENTRY:.*]]:
+; RV64-UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2: [[VECTOR_PH]]:
+; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64-UF2: [[VECTOR_BODY]]:
+; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV64-UF2-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV64-UF2-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], -4
+; RV64-UF2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], -5
+; RV64-UF2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -6
+; RV64-UF2-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -7
+; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP1]], -1
+; RV64-UF2-NEXT: [[TMP10:%.*]] = add nsw i64 [[TMP2]], -1
+; RV64-UF2-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP3]], -1
+; RV64-UF2-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP4]], -1
+; RV64-UF2-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP5]], -1
+; RV64-UF2-NEXT: [[TMP14:%.*]] = add nsw i64 [[TMP6]], -1
+; RV64-UF2-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP7]], -1
+; RV64-UF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP9]]
+; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP10]]
+; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]]
+; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP12]]
+; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]]
+; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]]
+; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]]
+; RV64-UF2-NEXT: [[TMP24:%.*]] = load i7, ptr [[TMP16]], align 1
+; RV64-UF2-NEXT: [[TMP25:%.*]] = load i7, ptr [[TMP17]], align 1
+; RV64-UF2-NEXT: [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1
+; RV64-UF2-NEXT: [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1
+; RV64-UF2-NEXT: [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP24]], i32 0
+; RV64-UF2-NEXT: [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP25]], i32 1
+; RV64-UF2-NEXT: [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2
+; RV64-UF2-NEXT: [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3
+; RV64-UF2-NEXT: [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1
+; RV64-UF2-NEXT: [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1
+; RV64-UF2-NEXT: [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1
+; RV64-UF2-NEXT: [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1
+; RV64-UF2-NEXT: [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0
+; RV64-UF2-NEXT: [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1
+; RV64-UF2-NEXT: [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
+; RV64-UF2-NEXT: [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
+; RV64-UF2-NEXT: [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
+; RV64-UF2-NEXT: [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
+; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP8]]
+; RV64-UF2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP9]]
+; RV64-UF2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP10]]
+; RV64-UF2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
+; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP12]]
+; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
+; RV64-UF2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
+; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
+; RV64-UF2-NEXT: [[TMP50:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
+; RV64-UF2-NEXT: store i7 [[TMP50]], ptr [[TMP42]], align 1
+; RV64-UF2-NEXT: [[TMP51:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
+; RV64-UF2-NEXT: store i7 [[TMP51]], ptr [[TMP43]], align 1
+; RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
+; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP44]], align 1
+; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
+; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP45]], align 1
+; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
+; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP46]], align 1
+; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
+; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP47]], align 1
+; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
+; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP48]], align 1
+; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
+; RV64-UF2-NEXT: store i7 [[TMP57]], ptr [[TMP49]], align 1
+; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
+; RV64-UF2-NEXT: br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV64-UF2: [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64-UF2: [[SCALAR_PH]]:
+; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
+; RV64-UF2: [[FOR_BODY]]:
+;
+entry:
+ br label %for.body
+
+for.body:
+ %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+ %iv.next = add nsw i64 %dec.iv, -1
+ %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next
+ %0 = load i7, ptr %arrayidx.b, align 1
+ %add = add i7 %0, 1
+ %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next
+ store i7 %add, ptr %arrayidx.a, align 1
+ %cmp = icmp ugt i64 %dec.iv, 1
+ br i1 %cmp, label %for.body, label %exit, !llvm.loop !4
+
+exit:
+ ret void
+}
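+; Note on the checks above: the irregular (non-byte-sized) i7 element type
+; keeps these memory accesses from being widened, so they are scalarized
+; lane by lane (scalar loads feeding insertelement, extractelement feeding
+; scalar stores) at a fixed VF of 4, or 8 with UF=2.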
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = distinct !{!4, !1, !3}
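+; !0 (used by the float tests) hints width 4 with scalable vectorization
+; enabled, while !4 (used by the i7 test) keeps the width-4 hint but drops
+; the scalable hint, which is why vector_reverse_irregular_type ends up
+; with a fixed-width VF above.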
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index ff9c585..b046f61 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -24,12 +24,16 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
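+; The hunk above switches vector_add from active-lane-mask tail folding to
+; EVL-based folding: AVL is the remaining trip count,
+; llvm.experimental.get.vector.length clamps it to one iteration's worth,
+; the accesses become vp.load/vp.store carrying an all-true mask plus that
+; explicit vector length, and a second induction variable advances by the
+; zext'd EVL.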
@@ -46,7 +50,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -87,15 +91,19 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
@@ -109,7 +117,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: store i64 [[V]], ptr [[AADDR]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -146,20 +154,24 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; CHECK-NEXT: [[TMP11]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP11]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[VEC_PHI]], i32 [[TMP7]])
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP11]])
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
@@ -175,7 +187,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
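+; For the indexed_load reduction, the select on the lane mask is replaced
+; by llvm.vp.merge: lanes at or past the explicit vector length keep the
+; previous accumulator value, so the vector.reduce.add in the middle block
+; still sums only the lanes the loop actually processed.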
@@ -217,13 +229,17 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
@@ -235,7 +251,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -272,14 +288,18 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
@@ -292,7 +312,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -363,15 +383,19 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1024)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
@@ -385,7 +409,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
index b4afdd7..cd53ea0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
@@ -1,17 +1,17 @@
; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
-; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data \
+; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=DATA
; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple riscv64-linux-gnu -force-tail-folding-style=data-with-evl -mattr=+v,+f -S \
; RUN: -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=EVL
-; CHECK: Cost of 2 for VF 2: EMIT{{.*}} = active lane mask
-; CHECK: Cost of 4 for VF 4: EMIT{{.*}} = active lane mask
-; CHECK: Cost of 8 for VF 8: EMIT{{.*}} = active lane mask
-; CHECK: Cost of 2 for VF vscale x 1: EMIT{{.*}} = active lane mask
-; CHECK: Cost of 4 for VF vscale x 2: EMIT{{.*}} = active lane mask
-; CHECK: Cost of 8 for VF vscale x 4: EMIT{{.*}} = active lane mask
+; DATA: Cost of 2 for VF 2: EMIT{{.*}} = active lane mask
+; DATA: Cost of 4 for VF 4: EMIT{{.*}} = active lane mask
+; DATA: Cost of 8 for VF 8: EMIT{{.*}} = active lane mask
+; DATA: Cost of 2 for VF vscale x 1: EMIT{{.*}} = active lane mask
+; DATA: Cost of 4 for VF vscale x 2: EMIT{{.*}} = active lane mask
+; DATA: Cost of 8 for VF vscale x 4: EMIT{{.*}} = active lane mask
; EVL: Cost of 1 for VF vscale x 1: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH
; EVL: Cost of 1 for VF vscale x 2: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH
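; The split into DATA and EVL prefixes above makes the cost contrast
; explicit: the active-lane-mask cost grows with the VF (2, 4, 8), while
; the EXPLICIT-VECTOR-LENGTH computation costs a constant 1 at each VF
; checked.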
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 528cec0..b56e712 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -170,15 +170,11 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP13]]
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT4]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP15]], i32 9)
-; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[BROADCAST_SPLAT2]], i32 1, <vscale x 4 x i1> [[TMP11]])
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT2]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
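+; Here EVL folding also deletes the vector lane-mask computation: the
+; broadcast of the scalar IV, the stepvector, and the derived
+; active-lane-mask select all fold away into a single
+; llvm.experimental.get.vector.length call feeding the vp.scatter.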
@@ -199,7 +195,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1
; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 8
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -298,7 +294,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -319,7 +315,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -359,8 +355,9 @@ attributes #1 = { "target-features"="+64bit,+v" }
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META7:![0-9]+]], [[META2]]}
+; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 8baf9d9..c6955f1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -2,9 +2,6 @@
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=SCALABLE
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=FIXEDLEN
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-SCALABLE
-; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-FIXEDLEN
-
-
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
@@ -103,15 +100,19 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
@@ -126,44 +127,10 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @uniform_load(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
@@ -277,22 +244,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ]
; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]]
;
-; TF-FIXEDLEN-LABEL: define i64 @uniform_load_outside_use(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: ret i64 [[V_LCSSA]]
-;
entry:
br label %for.body
@@ -437,25 +388,31 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 1)
; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]]
+; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], splat (i64 1024)
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
-; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> poison)
+; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP7]])
; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -474,55 +431,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @conditional_uniform_load(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10)
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP4]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label %[[DO_LOAD:.*]], label %[[LATCH]]
-; TF-FIXEDLEN: [[DO_LOAD]]:
-; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: br label %[[LATCH]]
-; TF-FIXEDLEN: [[LATCH]]:
-; TF-FIXEDLEN-NEXT: [[PHI:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[V]], %[[DO_LOAD]] ]
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
@@ -640,17 +552,21 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B]], align 1
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -663,44 +579,10 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @uniform_load_unaligned(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = load i64, ptr [[B]], align 1
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 1
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
@@ -813,15 +695,19 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -834,44 +720,10 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @uniform_store(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
@@ -1003,22 +855,27 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; TF-SCALABLE-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP13]]
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; TF-SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -1031,71 +888,10 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @uniform_store_of_loop_varying(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; TF-FIXEDLEN-NEXT: br i1 [[TMP0]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; TF-FIXEDLEN: [[PRED_STORE_IF]]:
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; TF-FIXEDLEN-NEXT: store i64 [[TMP1]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; TF-FIXEDLEN: [[PRED_STORE_CONTINUE]]:
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; TF-FIXEDLEN-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
-; TF-FIXEDLEN: [[PRED_STORE_IF1]]:
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1
-; TF-FIXEDLEN-NEXT: store i64 [[TMP3]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE2]]
-; TF-FIXEDLEN: [[PRED_STORE_CONTINUE2]]:
-; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; TF-FIXEDLEN: [[PRED_STORE_IF3]]:
-; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
-; TF-FIXEDLEN-NEXT: store i64 [[TMP5]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE4]]
-; TF-FIXEDLEN: [[PRED_STORE_CONTINUE4]]:
-; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; TF-FIXEDLEN-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
-; TF-FIXEDLEN: [[PRED_STORE_IF5]]:
-; TF-FIXEDLEN-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3
-; TF-FIXEDLEN-NEXT: store i64 [[TMP7]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE6]]
-; TF-FIXEDLEN: [[PRED_STORE_CONTINUE6]]:
-; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: store i64 [[IV]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
@@ -1240,24 +1036,28 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]]
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10)
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 2 x i1> [[TMP9]])
+; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> [[TMP10]], i32 [[TMP9]])
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -1275,55 +1075,10 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @conditional_uniform_store(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10)
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]])
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label %[[DO_STORE:.*]], label %[[LATCH]]
-; TF-FIXEDLEN: [[DO_STORE]]:
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT: br label %[[LATCH]]
-; TF-FIXEDLEN: [[LATCH]]:
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
@@ -1442,15 +1197,19 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]]
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -1463,44 +1222,10 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
-; TF-FIXEDLEN-LABEL: define void @uniform_store_unaligned(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN: [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0
-; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN: [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1
-; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
-; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN: [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1
-; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; TF-FIXEDLEN: [[FOR_END]]:
-; TF-FIXEDLEN-NEXT: ret void
-;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index fe6a693..acfcf90 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -16,48 +16,39 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; IF-EVL-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP13]]
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
-; IF-EVL-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 4
-; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP31]]
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP25]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP19]])
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP20]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
@@ -76,50 +67,36 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: for.cond.cleanup:
; IF-EVL-NEXT: ret void
;
; NO-VP-LABEL: @interleave(
; NO-VP-NEXT: entry:
; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NO-VP: vector.ph:
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP4]], 4
-; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP8]], 2
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
-; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; NO-VP-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
-; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[INDEX]], i32 0
-; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0
-; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; NO-VP-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[INDEX]], i32 0
; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
; NO-VP-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
-; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP15]]
; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; NO-VP-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
-; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
-; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP24]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NO-VP: middle.block:
@@ -163,6 +140,5 @@ for.cond.cleanup:
ret void
}
-!0 = distinct !{!0, !1, !2}
-!1 = !{!"llvm.loop.interleave.count", i32 2}
-!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
new file mode 100644
index 0000000..d7c9ce4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
@@ -0,0 +1,80 @@
+; This is the loop in C++ being vectorized in this file with
+; vector.reverse:
+; #pragma clang loop vectorize_width(4, scalable)
+; for (int i = N-1; i >= 0; --i)
+;   a[i] = b[i] + 1.0;
+
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -debug-only=loop-vectorize -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+
+define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: vp<[[OTC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[OTC]]> = EXPAND SCEV (1 + (-1 * (1 umin %n))<nuw><nsw> + %n)
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: vp<[[RESUME_IV_A:%.+]]> = DERIVED-IV ir<%n> + vp<[[VTC]]> * ir<-1>
+; CHECK-NEXT: vp<[[RESUME_IV_B:%.+]]> = DERIVED-IV ir<%n> + vp<[[VTC]]> * ir<-1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[INDUCTION:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[INDUCTION]]> * ir<-1>
+; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>, vp<[[VF]]>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_B:%.+]]> = getelementptr inbounds ir<[[B:%.+]]>, ir<[[IDX_PROM]]>
+; CHECK-NEXT: vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer inbounds ir<[[ARRAY_IDX_B]]>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[VAL_B:%.+]]> = load vp<[[VEC_END_PTR_B]]>
+; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add ir<[[VAL_B]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_A:%.+]]> = getelementptr inbounds ir<[[A:%.+]]>, ir<[[IDX_PROM]]>
+; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer inbounds ir<[[ARRAY_IDX_A]]>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[VEC_END_PTR_A]]>, ir<[[ADD_RESULT]]>
+; CHECK-NEXT: EMIT vp<[[INDEX_NEXT]]> = add nuw vp<[[INDUCTION]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[INDEX_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[OTC]]>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup>:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[RESUME_IV_A]]>, middle.block ], [ ir<%n>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<[[RESUME_IV_B]]>, middle.block ], [ ir<%n>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ %n, %entry ], [ %indvars.iv.next, %for.body ]
+ %i.0.in8 = phi i32 [ %n, %entry ], [ %i.0, %for.body ]
+ %i.0 = add nsw i32 %i.0.in8, -1
+ %idxprom = zext i32 %i.0 to i64
+ %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+ %1 = load i32, ptr %arrayidx, align 4
+ %add9 = add i32 %1, 1
+ %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+ store i32 %add9, ptr %arrayidx3, align 4
+ %cmp = icmp ugt i32 %indvars.iv, 1
+ %indvars.iv.next = add nsw i32 %indvars.iv, -1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
index bb61f431d..9652351 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -22,7 +22,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -78,7 +78,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -134,7 +134,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -190,7 +190,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -246,7 +246,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -297,7 +297,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -348,7 +348,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -403,7 +403,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -458,7 +458,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
index 2e1bcaa..3ec48ef 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
@@ -21,7 +21,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -74,7 +74,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -125,7 +125,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -176,7 +176,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -227,7 +227,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -278,7 +278,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -329,7 +329,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -380,7 +380,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -431,7 +431,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -482,7 +482,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -534,7 +534,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[INDEX_EVL:%.+]]> = phi ir<0>, vp<[[INDEX_EVL_NEXT:%.+]]>
; IF-EVL-NEXT: ir<[[IV:%.+]]> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<[[N]]>, vp<[[INDEX_EVL]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[INDEX_EVL]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN-GEP Inv[Var] ir<[[GEP:%.+]]> = getelementptr inbounds ir<%b>, ir<[[IV]]>
; IF-EVL-NEXT: WIDEN-CAST ir<[[PTRTOINT:%.+]]> = ptrtoint ir<[[GEP]]> to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
index 7540b77..7f29213 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
@@ -29,7 +29,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<[[FOR_PHI:%.+]]> = phi ir<33>, ir<[[LD:%.+]]>
; IF-EVL-NEXT: EMIT-SCALAR vp<[[PREV_EVL:%.+]]> = phi [ vp<[[VF32]]>, vector.ph ], [ vp<[[EVL:%.+]]>, vector.body ]
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%TC>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds nuw ir<%A>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index aa15a20..baf546b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -45,7 +45,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_SELECT:%.+]]>
; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]>
-; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -84,7 +84,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; IF-EVL-INLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]>
-; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-INLOOP-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index 563e515..97a6130 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -27,7 +27,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 05a495d..86b28c3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -67,17 +67,16 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT: [[TMP4:%.*]] = select i1 true, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 2)
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -120,11 +119,11 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
+; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
; CHECK: then.1:
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2
; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], true
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_0]], i1 false
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1]], i1 false
; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
; CHECK: then.2:
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
@@ -171,17 +170,16 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 true, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 2)
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -224,11 +222,11 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
; CHECK: then.1:
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2
; CHECK-NEXT: [[OR:%.*]] = or i1 true, [[CMP]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1]], i1 false
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_0]], i1 false
; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
; CHECK: then.2:
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
index bdd0c6f..7cc8458 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
@@ -431,195 +431,26 @@ exit:
ret void
}
-define void @lifetime_for_ptr_first_arg_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) {
-; CHECK-LABEL: @lifetime_for_ptr_first_arg_before_multiply(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
-; CHECK: then:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
-; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
-; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
-; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
-; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
-; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
-; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
-; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
-; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
-; CHECK-NEXT: ret void
-;
-entry:
- %a = load <4 x double>, ptr %A, align 8
- %b = load <4 x double>, ptr %B, align 8
- br i1 %c.0, label %then, label %exit
-
-then:
- call void @llvm.lifetime.end(i64 -1, ptr %A)
- br label %exit
-
-exit:
- %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
- store <4 x double> %m, ptr %C, align 8
- ret void
-}
-
-define void @lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) {
-; CHECK-LABEL: @lifetime_for_both_ptr_args_before_multiply(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
-; CHECK: then:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
-; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
-; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
-; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
-; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
-; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
-; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
-; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
-; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
-; CHECK-NEXT: ret void
-;
-entry:
- %a = load <4 x double>, ptr %A, align 8
- %b = load <4 x double>, ptr %B, align 8
- br i1 %c.0, label %then, label %exit
-
-then:
- call void @llvm.lifetime.end(i64 -1, ptr %B)
- call void @llvm.lifetime.end(i64 -1, ptr %A)
- br label %exit
-
-exit:
- %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
- store <4 x double> %m, ptr %C, align 8
- ret void
-}
-
-define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) {
+define void @multiple_unrelated_lifetimes(ptr noalias %C, i1 %c.0) {
; CHECK-LABEL: @multiple_unrelated_lifetimes(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOC_1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[ALLOC_2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: call void @init(ptr [[A]])
+; CHECK-NEXT: call void @init(ptr [[B]])
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
; CHECK: then:
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_1]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_2]])
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
@@ -682,6 +513,10 @@ define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr no
entry:
%alloc.1 = alloca i32
%alloc.2 = alloca i32
+ %A = alloca <4 x double>
+ %B = alloca <4 x double>
+ call void @init(ptr %A)
+ call void @init(ptr %B)
%a = load <4 x double>, ptr %A, align 8
%b = load <4 x double>, ptr %B, align 8
br i1 %c.0, label %then, label %exit
@@ -699,106 +534,20 @@ exit:
ret void
}
-define void @lifetime_for_ptr_select_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0, i1 %c.1) {
-; CHECK-LABEL: @lifetime_for_ptr_select_before_multiply(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P:%.*]] = select i1 [[C_0:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]
-; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
-; CHECK: then:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[P]], i64 0
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
-; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
-; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
-; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
-; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
-; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C:%.*]], i64 0
-; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
-; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
-; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
-; CHECK-NEXT: ret void
-;
-entry:
- %P = select i1 %c.0, ptr %A, ptr %B
- %a = load <4 x double>, ptr %P, align 8
- %b = load <4 x double>, ptr %B, align 8
- br i1 %c.1, label %then, label %exit
-
-then:
- call void @llvm.lifetime.end(i64 -1, ptr %P)
- br label %exit
-
-exit:
- %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
- store <4 x double> %m, ptr %C, align 8
- ret void
-}
-
-define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+define void @lifetimes_for_args_in_different_blocks(ptr noalias %C, i1 %c.0) {
; CHECK-LABEL: @lifetimes_for_args_in_different_blocks(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32
; CHECK-NEXT: call void @init(ptr [[A]])
+; CHECK-NEXT: call void @init(ptr [[B]])
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
; CHECK: then:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
@@ -864,7 +613,9 @@ define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias
;
entry:
%A = alloca <4 x double>
+ %B = alloca <4 x double>
call void @init(ptr %A)
+ call void @init(ptr %B)
br i1 %c.0, label %then, label %exit
then:
@@ -880,15 +631,17 @@ exit:
ret void
}
-define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+define void @lifetimes_for_args_in_different_blocks2(ptr noalias %C, i1 %c.0) {
; CHECK-LABEL: @lifetimes_for_args_in_different_blocks2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32
; CHECK-NEXT: call void @init(ptr [[A]])
+; CHECK-NEXT: call void @init(ptr [[B]])
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
; CHECK: then:
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
@@ -957,7 +710,9 @@ define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias
;
entry:
%A = alloca <4 x double>
+ %B = alloca <4 x double>
call void @init(ptr %A)
+ call void @init(ptr %B)
br i1 %c.0, label %then, label %exit
then:
@@ -973,18 +728,20 @@ exit:
ret void
}
-define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+define void @lifetimes_for_args_load0_in_different_block(ptr noalias %C, i1 %c.0) {
; CHECK-LABEL: @lifetimes_for_args_load0_in_different_block(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32
; CHECK-NEXT: call void @init(ptr [[A]])
+; CHECK-NEXT: call void @init(ptr [[B]])
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
; CHECK: then:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
@@ -1048,7 +805,9 @@ define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noa
;
entry:
%A = alloca <4 x double>
+ %B = alloca <4 x double>
call void @init(ptr %A)
+ call void @init(ptr %B)
%a = load <4 x double>, ptr %A, align 8
call void @llvm.lifetime.end(i64 -1, ptr %A)
br i1 %c.0, label %then, label %exit
@@ -1064,18 +823,20 @@ exit:
ret void
}
-define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+define void @lifetimes_for_args_load1_in_different_block(ptr noalias %C, i1 %c.0) {
; CHECK-LABEL: @lifetimes_for_args_load1_in_different_block(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32
; CHECK-NEXT: call void @init(ptr [[A]])
+; CHECK-NEXT: call void @init(ptr [[B]])
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
; CHECK: then:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
@@ -1139,7 +900,9 @@ define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noa
;
entry:
%A = alloca <4 x double>
+ %B = alloca <4 x double>
call void @init(ptr %A)
+ call void @init(ptr %B)
%b = load <4 x double>, ptr %B, align 8
call void @llvm.lifetime.end(i64 -1, ptr %B)
br i1 %c.0, label %then, label %exit
diff --git a/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll b/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll
index 03c86bc..87ff922 100644
--- a/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll
+++ b/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll
@@ -10,9 +10,6 @@ define amdgpu_kernel void @addressspace_alloca() {
; CHECK-NEXT: ret void
;
%alloca = alloca i8, align 8, addrspace(5)
- %cast = addrspacecast ptr addrspace(5) %alloca to ptr
- call void @llvm.lifetime.start.p0(i64 2, ptr %cast)
+ call void @llvm.lifetime.start(i64 2, ptr addrspace(5) %alloca)
ret void
}
-
-declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr)
diff --git a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll
index e9f40b5..d4bc097 100644
--- a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll
+++ b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll
@@ -54,10 +54,10 @@ define void @positive_gep_assume_uses() {
;
%A = alloca {i8, i16}
%B = getelementptr {i8, i16}, ptr %A, i32 0, i32 0
- call void @llvm.lifetime.start.p0(i64 2, ptr %B)
+ call void @llvm.lifetime.start.p0(i64 2, ptr %A)
call void @llvm.assume(i1 true) ["align"(ptr %B, i64 8), "align"(ptr %B, i64 16)]
store {i8, i16} zeroinitializer, ptr %A
- call void @llvm.lifetime.end.p0(i64 2, ptr %B)
+ call void @llvm.lifetime.end.p0(i64 2, ptr %A)
call void @llvm.assume(i1 true) ["nonnull"(ptr %B), "align"(ptr %B, i64 2)]
ret void
}
diff --git a/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll b/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll
index 3773d41..bcc9693 100644
--- a/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll
+++ b/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll
@@ -17,9 +17,8 @@ define void @test2() {
; CHECK: test2
; CHECK-NOT: alloca
%A = alloca {i8, i16}
- %B = getelementptr {i8, i16}, ptr %A, i32 0, i32 0
- call void @llvm.lifetime.start.p0(i64 2, ptr %B)
+ call void @llvm.lifetime.start.p0(i64 2, ptr %A)
store {i8, i16} zeroinitializer, ptr %A
- call void @llvm.lifetime.end.p0(i64 2, ptr %B)
+ call void @llvm.lifetime.end.p0(i64 2, ptr %A)
ret void
}
diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index 6158874..e9fc06b 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -116,22 +116,3 @@ define i32 @call_slot_clobber_before_lifetime_start() {
%v = load i32, ptr %dst
ret i32 %v
}
-
-define void @call_slot_lifetime_bitcast(ptr %ptr) {
-; CHECK-LABEL: @call_slot_lifetime_bitcast(
-; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 4 [[PTR:%.*]], i64 4, i1 false)
-; CHECK-NEXT: [[TMP1_CAST:%.*]] = bitcast ptr [[TMP1]] to ptr
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1_CAST]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1]], ptr align 4 [[PTR]], i64 4, i1 false)
-; CHECK-NEXT: ret void
-;
- %tmp1 = alloca i32
- %tmp2 = alloca i32
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp2, ptr align 4 %ptr, i64 4, i1 false)
- %tmp1.cast = bitcast ptr %tmp1 to ptr
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp1.cast)
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp1.cast, ptr align 4 %tmp2, i64 4, i1 false)
- ret void
-}
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
index 2f1ce37..816e103 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
@@ -26,35 +26,41 @@ define i32 @test1(ptr nocapture %foobie) nounwind noinline ssp uwtable {
}
; Check that the memcpy is removed.
-define void @test2(ptr sret(i8) noalias nocapture %out, ptr %in) nounwind noinline ssp uwtable {
+define void @test2(ptr sret(i8) noalias nocapture %out) nounwind noinline ssp uwtable {
; CHECK-LABEL: @test2(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[IN:%.*]])
+; CHECK-NEXT: [[IN:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[IN]])
; CHECK-NEXT: ret void
;
+ %in = alloca i64
call void @llvm.lifetime.start.p0(i64 8, ptr %in)
call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 8, i1 false)
ret void
}
; Check that the memcpy is not removed.
-define void @test3(ptr sret(i8) noalias nocapture %out, ptr %in) nounwind noinline ssp uwtable {
+define void @test3(ptr sret(i8) noalias nocapture %out) nounwind noinline ssp uwtable {
; CHECK-LABEL: @test3(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IN:%.*]])
+; CHECK-NEXT: [[IN:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IN]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[OUT:%.*]], ptr [[IN]], i64 8, i1 false)
; CHECK-NEXT: ret void
;
+ %in = alloca i64
call void @llvm.lifetime.start.p0(i64 4, ptr %in)
call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 8, i1 false)
ret void
}
; Check that the memcpy is not removed.
-define void @test_lifetime_may_alias(ptr %lifetime, ptr %src, ptr %dst) {
+define void @test_lifetime_may_alias(ptr %src, ptr %dst) {
; CHECK-LABEL: @test_lifetime_may_alias(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[LIFETIME:%.*]])
+; CHECK-NEXT: [[LIFETIME:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[LIFETIME]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 8, i1 false)
; CHECK-NEXT: ret void
;
+ %lifetime = alloca i64
call void @llvm.lifetime.start.p0(i64 8, ptr %lifetime)
call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
ret void
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
index 0c16f34..7ea63bb 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -37,29 +37,10 @@ define void @test_alloca_with_lifetimes(ptr %result) {
ret void
}
-define void @test_malloc_with_lifetimes(ptr %result) {
-; CHECK-LABEL: @test_malloc_with_lifetimes(
-; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 16)
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[A]])
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 12, i1 false)
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[A]])
-; CHECK-NEXT: call void @free(ptr [[A]])
-; CHECK-NEXT: ret void
-;
- %a = call ptr @malloc(i64 16)
- call void @llvm.lifetime.start.p0(i64 16, ptr %a)
- call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
- call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr %a)
- call void @free(ptr %a)
- ret void
-}
-
; memcpy size is larger than lifetime, don't optimize.
define void @test_copy_larger_than_lifetime_size(ptr %result) {
; CHECK-LABEL: @test_copy_larger_than_lifetime_size(
-; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false)
@@ -67,7 +48,7 @@ define void @test_copy_larger_than_lifetime_size(ptr %result) {
; CHECK-NEXT: call void @free(ptr [[A]])
; CHECK-NEXT: ret void
;
- %a = call ptr @malloc(i64 16)
+ %a = alloca %T, align 8
call void @llvm.lifetime.start.p0(i64 12, ptr %a)
call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false)
diff --git a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
index b654319..ff36bf0 100644
--- a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
@@ -94,21 +94,6 @@ entry:
ret void
}
-define i8 @test6(ptr %ptr, ptr noalias %ptr.1) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR]], align 8
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR]], ptr [[PTR_1:%.*]], i64 24, i1 false)
-; CHECK-NEXT: ret i8 [[TMP0]]
-;
-entry:
- call void @llvm.lifetime.start.p0(i64 24, ptr %ptr)
- %0 = load i8, ptr %ptr, align 8
- call void @llvm.memmove.p0.p0.i64(ptr %ptr, ptr %ptr.1, i64 24, i1 false)
- ret i8 %0
-}
-
define void @test7(ptr %ptr) {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
index 323df12..1784c2f 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -121,13 +121,14 @@ attributes #6 = { builtin }
!12 = !{i64 789, i64 300}
!13 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !14, producer: "clang version 21.0.0git (git@github.com:llvm/llvm-project.git e391301e0e4d9183fe06e69602e87b0bc889aeda)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
!14 = !DIFile(filename: "basic.cc", directory: "", checksumkind: CSK_MD5, checksum: "8636c46e81402013b9d54e8307d2f149")
-!15 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !13)
+!15 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !13, declaration: !22)
!16 = !DISubroutineType(types: !17)
!17 = !{!18}
!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
!19 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
!20 = !{i32 7, !"Dwarf Version", i32 5}
!21 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
; DUMP: CCG before cloning:
; DUMP: Callsite Context Graph:
@@ -290,7 +291,8 @@ attributes #6 = { builtin }
; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
;; Make sure the clone's linkageName was updated.
-; IR: ![[SP]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1"
+; IR: ![[SP]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1", {{.*}} declaration: ![[SP2:[0-9]+]])
+; IR: ![[SP2]] = !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1"
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
diff --git a/llvm/test/Transforms/MoveAutoInit/clobber.ll b/llvm/test/Transforms/MoveAutoInit/clobber.ll
index 09084b6..08ffb13 100644
--- a/llvm/test/Transforms/MoveAutoInit/clobber.ll
+++ b/llvm/test/Transforms/MoveAutoInit/clobber.ll
@@ -10,14 +10,14 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 {
; CHECK-NEXT: [[TMP4:%.*]] = alloca [100 x i8], align 16
; CHECK-NEXT: [[TMP5:%.*]] = alloca [2 x i8], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [100 x i8], ptr [[TMP4]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100, ptr nonnull [[TMP6]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100, ptr nonnull [[TMP4]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[TMP7]]) #[[ATTR3]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[TMP5]]) #[[ATTR3]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 1
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP1:%.*]], 0
; CHECK-NEXT: br i1 [[TMP9]], label [[TMP15:%.*]], label [[TMP10:%.*]]
; CHECK: 10:
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[TMP6]], i8 -86, i64 100, i1 false), !annotation !0
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[TMP6]], i8 -86, i64 100, i1 false), !annotation [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP0:%.*]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i8], ptr [[TMP4]], i64 0, i64 [[TMP11]]
; CHECK-NEXT: store i8 12, ptr [[TMP12]], align 1
@@ -28,8 +28,8 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP2:%.*]], 0
; CHECK-NEXT: br i1 [[TMP16]], label [[TMP22]], label [[TMP17:%.*]]
; CHECK: 17:
-; CHECK-NEXT: store i8 -86, ptr [[TMP7]], align 1, !annotation !0
-; CHECK-NEXT: store i8 -86, ptr [[TMP8]], align 1, !annotation !0
+; CHECK-NEXT: store i8 -86, ptr [[TMP7]], align 1, !annotation [[META0]]
+; CHECK-NEXT: store i8 -86, ptr [[TMP8]], align 1, !annotation [[META0]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 [[TMP18]]
; CHECK-NEXT: store i8 12, ptr [[TMP19]], align 1
@@ -38,19 +38,19 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 {
; CHECK-NEXT: br label [[TMP22]]
; CHECK: 22:
; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ [[TMP14]], [[TMP10]] ], [ [[TMP21]], [[TMP17]] ], [ 0, [[TMP15]] ]
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[TMP7]]) #[[ATTR3]]
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100, ptr nonnull [[TMP6]]) #[[ATTR3]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[TMP5]]) #[[ATTR3]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100, ptr nonnull [[TMP4]]) #[[ATTR3]]
; CHECK-NEXT: ret i32 [[TMP23]]
;
%4 = alloca [100 x i8], align 16
%5 = alloca [2 x i8], align 1
%6 = getelementptr inbounds [100 x i8], ptr %4, i64 0, i64 0
- call void @llvm.lifetime.start.p0(i64 100, ptr nonnull %6) #3
+ call void @llvm.lifetime.start.p0(i64 100, ptr nonnull %4) #3
; This memset must move.
call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) %6, i8 -86, i64 100, i1 false), !annotation !0
%7 = getelementptr inbounds [2 x i8], ptr %5, i64 0, i64 0
- call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %7) #3
+ call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %5) #3
; This store must move.
store i8 -86, ptr %7, align 1, !annotation !0
%8 = getelementptr inbounds [2 x i8], ptr %5, i64 0, i64 1
@@ -81,8 +81,8 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 {
22:
%23 = phi i32 [ %14, %10 ], [ %21, %17 ], [ 0, %15 ]
- call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %7) #3
- call void @llvm.lifetime.end.p0(i64 100, ptr nonnull %6) #3
+ call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %5) #3
+ call void @llvm.lifetime.end.p0(i64 100, ptr nonnull %4) #3
ret i32 %23
}
diff --git a/llvm/test/Transforms/NewGVN/lifetime-simple.ll b/llvm/test/Transforms/NewGVN/lifetime-simple.ll
index 55e4611..0a7bd33 100644
--- a/llvm/test/Transforms/NewGVN/lifetime-simple.ll
+++ b/llvm/test/Transforms/NewGVN/lifetime-simple.ll
@@ -4,10 +4,11 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin7"
-define i8 @test(ptr %P) nounwind {
+define i8 @test() nounwind {
; CHECK-LABEL: define i8 @test(
-; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P:%.*]] = alloca [32 x i8], align 1
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[P]])
; CHECK-NEXT: store i8 1, ptr [[P]], align 1
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[P]])
@@ -15,6 +16,7 @@ define i8 @test(ptr %P) nounwind {
; CHECK-NEXT: ret i8 [[TMP0]]
;
entry:
+ %P = alloca [32 x i8]
call void @llvm.lifetime.start.p0(i64 32, ptr %P)
%0 = load i8, ptr %P
store i8 1, ptr %P
diff --git a/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll b/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll
new file mode 100644
index 0000000..d1da7ea
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -passes=newgvn %s | FileCheck %s
+
+; Check that eliminateInstruction() replaces the debug uses of the instructions
+; marked for deletion with the dominating leader.
+
+define void @binop(i32 %x, i32 %y) !dbg !5 {
+; CHECK: #dbg_value(i32 %add1, [[META9:![0-9]+]], !DIExpression(), [[META12:![0-9]+]])
+; CHECK-NEXT: #dbg_value(i32 %add1, [[META11:![0-9]+]], !DIExpression(), [[META13:![0-9]+]])
+;
+ %add1 = add i32 %x, %y, !dbg !12
+ #dbg_value(i32 %add1, !9, !DIExpression(), !12)
+ %add2 = add i32 %y, %x, !dbg !13
+ #dbg_value(i32 %add2, !11, !DIExpression(), !13)
+ call void @use(i32 %add1, i32 %add2), !dbg !14
+ ret void, !dbg !15
+}
+
+declare void @use(i32, i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "/app/example.ll", directory: "/")
+!2 = !{i32 4}
+!3 = !{i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "binop", linkageName: "binop", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9, !11}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10)
+!12 = !DILocation(line: 1, column: 1, scope: !5)
+!13 = !DILocation(line: 2, column: 1, scope: !5)
+!14 = !DILocation(line: 3, column: 1, scope: !5)
+!15 = !DILocation(line: 4, column: 1, scope: !5)
+;.
+; CHECK: [[META9]] = !DILocalVariable(name: "1",
+; CHECK: [[META11]] = !DILocalVariable(name: "2",
+; CHECK: [[META12]] = !DILocation(line: 1,
+; CHECK: [[META13]] = !DILocation(line: 2,
+;.
diff --git a/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll b/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll
new file mode 100644
index 0000000..cc69541
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll
@@ -0,0 +1,33 @@
+; RUN: opt -passes=newgvn -S %s | FileCheck %s
+
+; Check that assignDFSNumbers() in NewGVN salvages the debug values of the
+; trivially dead instructions that are marked for deletion.
+
+; CHECK: #dbg_value(i8 %tmp, [[META11:![0-9]+]], !DIExpression(DW_OP_constu, 8, DW_OP_eq, DW_OP_stack_value), [[META26:![0-9]+]])
+; CHECK: [[META11]] = !DILocalVariable(name: "2"
+; CHECK: [[META26]] = !DILocation(line: 2
+
+define void @test13() !dbg !5 {
+entry:
+ %tmp = load i8, ptr null, align 1
+ %tmp2 = icmp eq i8 %tmp, 8, !dbg !13
+ #dbg_value(i1 %tmp2, !11, !DIExpression(), !13)
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "/app/example.ll", directory: "/")
+!2 = !{i32 3}
+!3 = !{i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test13", linkageName: "test13", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!11}
+!10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10)
+!13 = !DILocation(line: 2, column: 1, scope: !5) \ No newline at end of file
diff --git a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
index 2a1fcf3..a19a2a6 100644
--- a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
+++ b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
@@ -10,6 +10,7 @@ declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
define void @tinkywinky() {
; CHECK-LABEL: define void @tinkywinky() {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
; CHECK-NEXT: br i1 false, label [[BODY:%.*]], label [[END:%.*]]
; CHECK: body:
; CHECK-NEXT: store i8 poison, ptr null, align 1
@@ -18,11 +19,12 @@ define void @tinkywinky() {
; CHECK-NEXT: ret void
;
entry:
- call void @llvm.lifetime.start.p0(i64 4, ptr undef)
+ %a = alloca i8
+ call void @llvm.lifetime.start.p0(i64 4, ptr %a)
br i1 false, label %body, label %end
body:
- call void @llvm.lifetime.start.p0(i64 4, ptr undef)
+ call void @llvm.lifetime.start.p0(i64 4, ptr %a)
br label %end
end:
diff --git a/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll b/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll
index 60180c4..180fd0a 100644
--- a/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll
+++ b/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll
@@ -80,12 +80,14 @@ entry:
; CHECK-LABEL: define ptr @elide_with_retainRV_splitByLifetime(
; CHECK-NEXT: entry:
+; CHECK-NEXT: %x = alloca ptr
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %x)
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %x)
; CHECK-NEXT: ret ptr %x
-define ptr @elide_with_retainRV_splitByLifetime(ptr %x) nounwind {
+define ptr @elide_with_retainRV_splitByLifetime() nounwind {
entry:
; Cleanup should skip over lifetime intrinsics.
+ %x = alloca ptr
call void @llvm.lifetime.start(i64 8, ptr %x)
%b = call ptr @llvm.objc.autoreleaseReturnValue(ptr %x) nounwind
call void @llvm.lifetime.end(i64 8, ptr %x)
@@ -218,13 +220,15 @@ entry:
; CHECK-LABEL: define ptr @elide_with_claimRV_splitByLifetime(
; CHECK-NEXT: entry:
+; CHECK-NEXT: %x = alloca ptr
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %x)
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %x)
; CHECK-NEXT: tail call void @llvm.objc.release(ptr %x)
; CHECK-NEXT: ret ptr %x
-define ptr @elide_with_claimRV_splitByLifetime(ptr %x) nounwind {
+define ptr @elide_with_claimRV_splitByLifetime() nounwind {
entry:
; Cleanup should skip over lifetime intrinsics.
+ %x = alloca ptr
call void @llvm.lifetime.start(i64 8, ptr %x)
%b = call ptr @llvm.objc.autoreleaseReturnValue(ptr %x) nounwind
call void @llvm.lifetime.end(i64 8, ptr %x)
diff --git a/llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll b/llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll
new file mode 100644
index 0000000..896717f
--- /dev/null
+++ b/llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; Test for autorelease pool optimizations
+; RUN: opt -passes=objc-arc < %s -S | FileCheck %s
+
+declare ptr @llvm.objc.autoreleasePoolPush()
+declare void @llvm.objc.autoreleasePoolPop(ptr)
+declare ptr @llvm.objc.autorelease(ptr)
+declare ptr @llvm.objc.retain(ptr)
+declare ptr @create_object()
+declare void @use_object(ptr)
+declare ptr @object_with_thing()
+declare void @opaque_callee()
+
+; Empty autorelease pool should be eliminated
+define void @test_empty_pool() {
+; CHECK-LABEL: define void @test_empty_pool() {
+; CHECK-NEXT: ret void
+;
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; A pool left holding only a release (after autorelease-to-release conversion) should be removed
+define void @test_autorelease_to_release() {
+; CHECK-LABEL: define void @test_autorelease_to_release() {
+; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object()
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) #[[ATTR0:[0-9]+]], !clang.imprecise_release [[META0:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+ %obj = call ptr @create_object()
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj)
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Pool with autoreleases should not be optimized
+define void @test_multiple_autoreleases() {
+; CHECK-LABEL: define void @test_multiple_autoreleases() {
+; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]]
+; CHECK-NEXT: call void @use_object(ptr [[OBJ1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ1]]) #[[ATTR0]]
+; CHECK-NEXT: call void @use_object(ptr [[OBJ2]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ2]]) #[[ATTR0]]
+; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
+;
+ %obj1 = call ptr @create_object()
+ %obj2 = call ptr @create_object()
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call void @use_object(ptr %obj1)
+ call ptr @llvm.objc.autorelease(ptr %obj1)
+ call void @use_object(ptr %obj2)
+ call ptr @llvm.objc.autorelease(ptr %obj2)
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Pool with calls should not be optimized
+define void @test_calls() {
+; CHECK-LABEL: define void @test_calls() {
+; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]]
+; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @object_with_thing()
+; CHECK-NEXT: call void @use_object(ptr [[OBJ1]])
+; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
+;
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ %obj1 = call ptr @object_with_thing()
+ call void @use_object(ptr %obj1)
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Pool with opaque call should not be optimized
+define void @test_opaque_call() {
+; CHECK-LABEL: define void @test_opaque_call() {
+; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]]
+; CHECK-NEXT: call void @opaque_callee()
+; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
+;
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call void @opaque_callee()
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Nested empty pools should be eliminated
+define void @test_nested_empty_pools() {
+; CHECK-LABEL: define void @test_nested_empty_pools() {
+; CHECK-NEXT: ret void
+;
+ %pool1 = call ptr @llvm.objc.autoreleasePoolPush()
+ %pool2 = call ptr @llvm.objc.autoreleasePoolPush()
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool2)
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool1)
+ ret void
+}
+
+; Empty pool with cast should be eliminated
+define void @test_empty_pool_with_cast() {
+; CHECK-LABEL: define void @test_empty_pool_with_cast() {
+; CHECK-NEXT: [[CAST:%.*]] = bitcast ptr poison to ptr
+; CHECK-NEXT: ret void
+;
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ %cast = bitcast ptr %pool to ptr
+ call void @llvm.objc.autoreleasePoolPop(ptr %cast)
+ ret void
+}
+
+; Autorelease shadowing - autorelease in inner pool doesn't prevent outer optimization
+define void @test_autorelease_shadowing_basic() {
+; CHECK-LABEL: define void @test_autorelease_shadowing_basic() {
+; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object()
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: ret void
+;
+ %obj = call ptr @create_object()
+ %outer_pool = call ptr @llvm.objc.autoreleasePoolPush()
+
+ ; Inner pool with autorelease - this should be shadowed
+ %inner_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner_pool)
+
+ call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool)
+ ret void
+}
+
+; Multiple nested levels with shadowing
+define void @test_multiple_nested_shadowing() {
+; CHECK-LABEL: define void @test_multiple_nested_shadowing() {
+; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object()
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ2]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: ret void
+;
+ %obj1 = call ptr @create_object()
+ %obj2 = call ptr @create_object()
+ %outer_pool = call ptr @llvm.objc.autoreleasePoolPush()
+
+ ; First inner pool
+ %inner1_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj1)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner1_pool)
+
+ ; Second inner pool with nested level
+ %inner2_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ %inner3_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj2)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner3_pool)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner2_pool)
+
+ call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool)
+ ret void
+}
+
+; Autorelease outside inner pool prevents optimization
+define void @test_autorelease_outside_inner_pool() {
+; CHECK-LABEL: define void @test_autorelease_outside_inner_pool() {
+; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object()
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ2]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: ret void
+;
+ %obj1 = call ptr @create_object()
+ %obj2 = call ptr @create_object()
+ %outer_pool = call ptr @llvm.objc.autoreleasePoolPush()
+
+ ; This autorelease is NOT in an inner pool, so the outer pool can't be optimized
+ call ptr @llvm.objc.autorelease(ptr %obj1)
+
+ ; Inner pool with autorelease (shadowed)
+ %inner_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj2)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner_pool)
+
+ call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool)
+ ret void
+}
+
+; Known ObjC functions don't prevent optimization
+define void @test_known_objc_functions() {
+; CHECK-LABEL: define void @test_known_objc_functions() {
+; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object()
+; CHECK-NEXT: ret void
+;
+ %obj = call ptr @create_object()
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+
+ ; These are all known ObjC runtime functions that don't produce autoreleases
+ %retained = call ptr @llvm.objc.retain(ptr %obj)
+ call void @llvm.objc.release(ptr %obj)
+
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Complex shadowing with mixed autoreleases
+define void @test_complex_shadowing() {
+; CHECK-LABEL: define void @test_complex_shadowing() {
+; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[OBJ3:%.*]] = call ptr @create_object()
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ2]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: [[INNER2_POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ3]]) #[[ATTR0]]
+; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[INNER2_POOL]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
+;
+ %obj1 = call ptr @create_object()
+ %obj2 = call ptr @create_object()
+ %obj3 = call ptr @create_object()
+ %outer_pool = call ptr @llvm.objc.autoreleasePoolPush()
+
+ ; This autorelease is outside any inner pool, so it prevents optimization
+ call ptr @llvm.objc.autorelease(ptr %obj1)
+
+ ; Inner pool 1 with shadowed autorelease
+ %inner1_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj2)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner1_pool)
+
+ ; Some safe ObjC operations
+ %retained = call ptr @llvm.objc.retain(ptr %obj3)
+ call void @llvm.objc.release(ptr %retained)
+
+ ; Inner pool 2 with shadowed autorelease
+ %inner2_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj3)
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner2_pool)
+
+ call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool)
+ ret void
+}
+
+; Non-ObjC function that may autorelease prevents optimization
+define void @test_non_objc_may_autorelease() {
+; CHECK-LABEL: define void @test_non_objc_may_autorelease() {
+; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @function_that_might_autorelease()
+; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
+;
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @function_that_might_autorelease()
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Non-ObjC function that doesn't autorelease allows optimization
+define void @test_non_objc_no_autorelease() {
+; CHECK-LABEL: define void @test_non_objc_no_autorelease() {
+; CHECK-NEXT: call void @safe_function()
+; CHECK-NEXT: ret void
+;
+ %pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call void @safe_function()
+ call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+ ret void
+}
+
+; Incomplete push/pop pairs across blocks - only inner pairs count
+define void @test_incomplete_pairs_inner_shadowing() {
+; CHECK-LABEL: define void @test_incomplete_pairs_inner_shadowing() {
+; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[OUTER_POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]]
+; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) #[[ATTR0]], !clang.imprecise_release [[META0]]
+; CHECK-NEXT: ret void
+;
+ %obj = call ptr @create_object()
+ %outer_pool = call ptr @llvm.objc.autoreleasePoolPush()
+
+ ; Inner complete pair - autorelease should be shadowed by this
+ %inner_pool = call ptr @llvm.objc.autoreleasePoolPush()
+ call ptr @llvm.objc.autorelease(ptr %obj) ; This SHOULD be shadowed by inner pair
+ call void @llvm.objc.autoreleasePoolPop(ptr %inner_pool) ; Completes the inner pair
+
+ ; Note: %outer_pool pop is in a different block (common pattern)
+ ; But the autorelease was shadowed by the complete inner pair
+ ret void
+}
+
+; Helper functions for testing interprocedural analysis
+
+; Safe function that doesn't call autorelease
+define void @safe_function() {
+ ; Just some computation, no autoreleases
+; CHECK-LABEL: define void @safe_function() {
+; CHECK-NEXT: [[X:%.*]] = add i32 1, 2
+; CHECK-NEXT: ret void
+;
+ %x = add i32 1, 2
+ ret void
+}
+
+; Function that may produce autoreleases (simulated by calling autorelease)
+define ptr @function_that_might_autorelease() {
+; CHECK-LABEL: define ptr @function_that_might_autorelease() {
+; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object()
+; CHECK-NEXT: [[AUTORELEASED:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ]]) #[[ATTR0]]
+; CHECK-NEXT: ret ptr [[AUTORELEASED]]
+;
+ %obj = call ptr @create_object()
+ %autoreleased = call ptr @llvm.objc.autorelease(ptr %obj)
+ ret ptr %autoreleased
+}
+
+;.
+; CHECK: [[META0]] = !{}
+;.
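; A distilled sketch of the shadowing rule the tests above exercise (a
; reading of the tests, not wording from the commit; function name
; hypothetical): an autorelease only pins its innermost complete push/pop
; pair, so an outer pair wrapping a fully matched inner pair can still be
; removed.
define void @shadowing_sketch() {
  %obj = call ptr @create_object()
  %outer = call ptr @llvm.objc.autoreleasePoolPush() ; removable
  %inner = call ptr @llvm.objc.autoreleasePoolPush() ; pins the autorelease
  call ptr @llvm.objc.autorelease(ptr %obj)
  call void @llvm.objc.autoreleasePoolPop(ptr %inner)
  call void @llvm.objc.autoreleasePoolPop(ptr %outer) ; removable
  ret void
}
declare ptr @llvm.objc.autoreleasePoolPush()
declare void @llvm.objc.autoreleasePoolPop(ptr)
declare ptr @llvm.objc.autorelease(ptr)
declare ptr @create_object()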
diff --git a/llvm/test/Transforms/PGOProfile/icp_mismatch_msg.ll b/llvm/test/Transforms/PGOProfile/icp_mismatch_msg.ll
index a81fb36..3ea196a 100644
--- a/llvm/test/Transforms/PGOProfile/icp_mismatch_msg.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_mismatch_msg.ll
@@ -1,8 +1,8 @@
; RUN: opt < %s -passes=pgo-icall-prom -pass-remarks-missed=pgo-icall-prom -S 2>&1 | FileCheck %s
-; CHECK: remark: <unknown>:0:0: Cannot promote indirect call to func4 with count of 1234: The number of arguments mismatch
-; CHECK: remark: <unknown>:0:0: Cannot promote indirect call: target with md5sum{{.*}} not found
-; CHECK: remark: <unknown>:0:0: Cannot promote indirect call to func2 with count of 7890: Return type mismatch
+; CHECK: remark: <unknown>:0:0: Cannot promote indirect call to func4 (count=1234): The number of arguments mismatch
+; CHECK: remark: <unknown>:0:0: Cannot promote indirect call: target with md5sum {{.*}} not found (count=2345)
+; CHECK: remark: <unknown>:0:0: Cannot promote indirect call to func2 (count=7890): Return type mismatch
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PGOProfile/indirect_call_promotion2.ll b/llvm/test/Transforms/PGOProfile/indirect_call_promotion2.ll
new file mode 100644
index 0000000..3dfc926
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/indirect_call_promotion2.ll
@@ -0,0 +1,154 @@
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=false -icp-allow-hot-only=true -icp-allow-candidate-skip=false -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefix=REMARK1
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=true -icp-allow-hot-only=true -icp-allow-candidate-skip=false -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefixes=REMARK1,REMARK2
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=false -icp-allow-hot-only=true -icp-allow-candidate-skip=false -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefix=REMARK1
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=false -icp-allow-hot-only=false -icp-allow-candidate-skip=false -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefixes=REMARK1
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=false -icp-allow-hot-only=false -icp-allow-candidate-skip=true -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefixes=REMARK1,REMARK3
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=true -icp-allow-hot-only=false -icp-allow-candidate-skip=true -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefixes=REMARK1,REMARK2,REMARK4,REMARK5
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=false -icp-allow-hot-only=false -icp-allow-candidate-skip=true -S -pass-remarks=pgo-icall-prom 2>&1 | FileCheck %s --check-prefixes=REMARK6,REMARK1,REMARK3
+; RUN: opt < %s -passes=pgo-icall-prom -icp-allow-decls=false -icp-allow-hot-only=false -icp-allow-candidate-skip=true -S | FileCheck %s --check-prefix=METADATA
+
+; REMARK6: remark: <unknown>:0:0: Promote indirect call to add with count 20000 out of 60000
+; REMARK2: remark: <unknown>:0:0: Promote indirect call to sub with count 40000 out of 60000
+; REMARK2: remark: <unknown>:0:0: Promote indirect call to add with count 20000 out of 20000
+; REMARK1: remark: <unknown>:0:0: Promote indirect call to add with count 10000 out of 10000
+; REMARK3: remark: <unknown>:0:0: Promote indirect call to add with count 200 out of 400
+; REMARK4: remark: <unknown>:0:0: Promote indirect call to sub with count 200 out of 400
+; REMARK5: remark: <unknown>:0:0: Promote indirect call to add with count 200 out of 200
+
+@math = dso_local local_unnamed_addr global ptr null, align 8
+
+define dso_local i32 @add(i32 noundef %a, i32 noundef %b) !prof !34 {
+entry:
+ %add = add nsw i32 %a, %b
+ ret i32 %add
+}
+
+define dso_local range(i32 0, 2) i32 @main() !prof !35 {
+entry:
+ call void @setup(i32 noundef 0)
+ br label %for.cond
+
+for.cond:
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %cmp = icmp samesign ult i32 %i.0, 50000
+ br i1 %cmp, label %for.body, label %for.end, !prof !36
+
+for.body:
+ %0 = load ptr, ptr @math, align 8, !tbaa !37
+ %call = call i32 %0(i32 noundef %i.0, i32 noundef %i.0), !prof !41
+; METADATA: %call = call i32 %0(i32 noundef %i.0, i32 noundef %i.0), !prof ![[NEWVP:[0-9]+]]
+; METADATA: ![[NEWVP]] = !{!"VP", i32 0, i64 40000, i64 -455885480058394486, i64 40000}
+ %add = add nsw i32 %sum.0, %call
+ %inc = add nuw nsw i32 %i.0, 1
+ br label %for.cond, !llvm.loop !42
+
+for.end:
+ call void @setup(i32 noundef 1)
+ br label %for.cond1
+
+for.cond1:
+ %i.1 = phi i32 [ 0, %for.end ], [ %inc7, %for.body3 ]
+ %sum.1 = phi i32 [ %sum.0, %for.end ], [ %add5, %for.body3 ]
+ %cmp2 = icmp samesign ult i32 %i.1, 10000
+ br i1 %cmp2, label %for.body3, label %for.cond9, !prof !44
+
+for.body3:
+ %1 = load ptr, ptr @math, align 8, !tbaa !37
+ %call4 = call i32 %1(i32 noundef %i.1, i32 noundef %i.1), !prof !45
+ %add5 = add nsw i32 %sum.1, %call4
+ %inc7 = add nuw nsw i32 %i.1, 1
+ br label %for.cond1, !llvm.loop !46
+
+for.cond9:
+ %i.2 = phi i32 [ %inc15, %for.body11 ], [ 0, %for.cond1 ]
+ %sum.2 = phi i32 [ %add13, %for.body11 ], [ %sum.1, %for.cond1 ]
+ %cmp10 = icmp samesign ult i32 %i.2, 400
+ br i1 %cmp10, label %for.body11, label %for.cond17, !prof !47
+
+for.body11:
+ call void @setup(i32 noundef %i.2)
+ %2 = load ptr, ptr @math, align 8, !tbaa !37
+ %call12 = call i32 %2(i32 noundef %i.2, i32 noundef %i.2), !prof !48
+ %add13 = add nsw i32 %sum.2, %call12
+ %inc15 = add nuw nsw i32 %i.2, 1
+ br label %for.cond9, !llvm.loop !49
+
+for.cond17:
+ %i.3 = phi i32 [ %inc25, %for.body19 ], [ 0, %for.cond9 ]
+ %sum.3 = phi i32 [ %add23, %for.body19 ], [ %sum.2, %for.cond9 ]
+ %cmp18 = icmp samesign ult i32 %i.3, 400
+ br i1 %cmp18, label %for.body19, label %for.end26, !prof !47
+
+for.body19:
+ %add.i = shl nuw nsw i32 %i.3, 1
+ %add21 = add nsw i32 %sum.3, %add.i
+ %call22 = call i32 @sub(i32 noundef %i.3, i32 noundef %i.3)
+ %add23 = add nsw i32 %add21, %call22
+ %inc25 = add nuw nsw i32 %i.3, 1
+ br label %for.cond17, !llvm.loop !50
+
+for.end26:
+ %cmp27 = icmp slt i32 %sum.3, 11
+ %. = zext i1 %cmp27 to i32
+ ret i32 %.
+}
+
+declare void @setup(i32 noundef)
+
+declare i32 @sub(i32 noundef, i32 noundef)
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!33}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 1, !"ProfileSummary", !5}
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!6 = !{!"ProfileFormat", !"InstrProf"}
+!7 = !{!"TotalCount", i64 122204}
+!8 = !{!"MaxCount", i64 50600}
+!9 = !{!"MaxInternalCount", i64 10000}
+!10 = !{!"MaxFunctionCount", i64 50600}
+!11 = !{!"NumCounts", i64 9}
+!12 = !{!"NumFunctions", i64 4}
+!13 = !{!"IsPartialProfile", i64 0}
+!14 = !{!"PartialProfileRatio", double 0.000000e+00}
+!15 = !{!"DetailedSummary", !16}
+!16 = !{!17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
+!17 = !{i32 10000, i64 50600, i32 1}
+!18 = !{i32 100000, i64 50600, i32 1}
+!19 = !{i32 200000, i64 50600, i32 1}
+!20 = !{i32 300000, i64 50600, i32 1}
+!21 = !{i32 400000, i64 50600, i32 1}
+!22 = !{i32 500000, i64 50000, i32 2}
+!23 = !{i32 600000, i64 50000, i32 2}
+!24 = !{i32 700000, i64 50000, i32 2}
+!25 = !{i32 800000, i64 50000, i32 2}
+!26 = !{i32 900000, i64 10200, i32 3}
+!27 = !{i32 950000, i64 10000, i32 4}
+!28 = !{i32 990000, i64 402, i32 5}
+!29 = !{i32 999000, i64 201, i32 8}
+!30 = !{i32 999900, i64 201, i32 8}
+!31 = !{i32 999990, i64 201, i32 8}
+!32 = !{i32 999999, i64 201, i32 8}
+!33 = !{!"clang version 22.0.0git (git@github.com:llvm/llvm-project.git ac20b28c2be26061e63dceac0915f97ece2273ac)"}
+!34 = !{!"function_entry_count", i64 10200}
+!35 = !{!"function_entry_count", i64 1}
+!36 = !{!"branch_weights", i32 50000, i32 1}
+!37 = !{!38, !38, i64 0}
+!38 = !{!"any pointer", !39, i64 0}
+!39 = !{!"omnipotent char", !40, i64 0}
+!40 = !{!"Simple C/C++ TBAA"}
+!41 = !{!"VP", i32 0, i64 60000, i64 -455885480058394486, i64 40000, i64 2232412992676883508, i64 20000}
+!42 = distinct !{!42, !43}
+!43 = !{!"llvm.loop.mustprogress"}
+!44 = !{!"branch_weights", i32 10000, i32 1}
+!45 = !{!"VP", i32 0, i64 10000, i64 2232412992676883508, i64 10000}
+!46 = distinct !{!46, !43}
+!47 = !{!"branch_weights", i32 400, i32 1}
+!48 = !{!"VP", i32 0, i64 400, i64 -455885480058394486, i64 200, i64 2232412992676883508, i64 200}
+!49 = distinct !{!49, !43}
+!50 = distinct !{!50, !43}
diff --git a/llvm/test/Transforms/PGOProfile/prof-verify-as-needed.ll b/llvm/test/Transforms/PGOProfile/prof-verify-as-needed.ll
new file mode 100644
index 0000000..07e1f2d
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/prof-verify-as-needed.ll
@@ -0,0 +1,20 @@
+; Test that prof-inject only injects missing metadata
+
+; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s
+
+define void @foo(i32 %i) {
+ %c = icmp eq i32 %i, 0
+ br i1 %c, label %yes, label %no, !prof !0
+yes:
+ br i1 %c, label %yes2, label %no
+yes2:
+ ret void
+no:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 2}
+; CHECK: br i1 %c, label %yes, label %no, !prof !0
+; CHECK: br i1 %c, label %yes2, label %no, !prof !1
+; CHECK: !0 = !{!"branch_weights", i32 1, i32 2}
+; CHECK: !1 = !{!"branch_weights", i32 3, i32 5}
diff --git a/llvm/test/Transforms/PGOProfile/prof-verify-existing.ll b/llvm/test/Transforms/PGOProfile/prof-verify-existing.ll
new file mode 100644
index 0000000..ea4f0f9
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/prof-verify-existing.ll
@@ -0,0 +1,21 @@
+; Test that prof-inject does not modify existing metadata (incl. "unknown")
+
+; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s
+; RUN: opt -passes=prof-verify %s -S --disable-output
+
+define void @foo(i32 %i) {
+ %c = icmp eq i32 %i, 0
+ br i1 %c, label %yes, label %no, !prof !0
+yes:
+ br i1 %c, label %yes2, label %no, !prof !1
+yes2:
+ ret void
+no:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 2}
+!1 = !{!"unknown"}
+; CHECK: br i1 %c, label %yes, label %no, !prof !0
+; CHECK: !0 = !{!"branch_weights", i32 1, i32 2}
+; CHECK: !1 = !{!"unknown"}
diff --git a/llvm/test/Transforms/PGOProfile/prof-verify.ll b/llvm/test/Transforms/PGOProfile/prof-verify.ll
new file mode 100644
index 0000000..c83475a
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/prof-verify.ll
@@ -0,0 +1,19 @@
+; Test prof-inject and prof-verify
+
+; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefix=INJECT
+; RUN: not opt -passes=prof-verify %s -S -o - 2>&1 | FileCheck %s --check-prefix=VERIFY
+; RUN: opt -passes=prof-inject,prof-verify %s --disable-output
+
+define void @foo(i32 %i) {
+ %c = icmp eq i32 %i, 0
+ br i1 %c, label %yes, label %no
+yes:
+ ret void
+no:
+ ret void
+}
+
+; INJECT: br i1 %c, label %yes, label %no, !prof !0
+; INJECT: !0 = !{!"branch_weights", i32 3, i32 5}
+
+; VERIFY: Profile verification failed \ No newline at end of file
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
index 7175816..05674b9 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -94,7 +94,7 @@ define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data
; CHECK-NEXT: [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret i32 0
;
@@ -801,6 +801,8 @@ attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.mustprogress"}
;.
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]}
; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META6]] = !{!"llvm.loop.unswitch.nontrivial.disable"}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META5]]}
;.
diff --git a/llvm/test/Transforms/SCCP/uscmp.ll b/llvm/test/Transforms/SCCP/uscmp.ll
new file mode 100644
index 0000000..d010c06
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/uscmp.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=sccp -S < %s | FileCheck %s
+
+define i32 @scmp_to_sub(i32 range(i32 -1, 2) %a) {
+; CHECK-LABEL: define i32 @scmp_to_sub(
+; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[A]], 0
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %scmp = call i32 @llvm.scmp(i32 %a, i32 0)
+ ret i32 %scmp
+}
+
+define i32 @scmp_zext_to_sub(i1 %a, i1 %b) {
+; CHECK-LABEL: define i32 @scmp_zext_to_sub(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: [[ZEXT_A:%.*]] = zext i1 [[A]] to i32
+; CHECK-NEXT: [[ZEXT_B:%.*]] = zext i1 [[B]] to i32
+; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[ZEXT_A]], [[ZEXT_B]]
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %zext_a = zext i1 %a to i32
+ %zext_b = zext i1 %b to i32
+ %scmp = call i32 @llvm.scmp(i32 %zext_a, i32 %zext_b)
+ ret i32 %scmp
+}
+
+define i8 @scmp_to_sub_trunc(i32 range(i32 -1, 2) %a) {
+; CHECK-LABEL: define i8 @scmp_to_sub_trunc(
+; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP1:%.*]] = sub nsw i32 [[A]], 0
+; CHECK-NEXT: [[SCMP:%.*]] = trunc i32 [[SCMP1]] to i8
+; CHECK-NEXT: ret i8 [[SCMP]]
+;
+ %scmp = call i8 @llvm.scmp(i32 %a, i32 0)
+ ret i8 %scmp
+}
+
+define i64 @scmp_to_sub_sext(i32 range(i32 -1, 2) %a) {
+; CHECK-LABEL: define i64 @scmp_to_sub_sext(
+; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP1:%.*]] = sub nsw i32 [[A]], 0
+; CHECK-NEXT: [[SCMP:%.*]] = sext i32 [[SCMP1]] to i64
+; CHECK-NEXT: ret i64 [[SCMP]]
+;
+ %scmp = call i64 @llvm.scmp(i32 %a, i32 0)
+ ret i64 %scmp
+}
+
+define i32 @scmp_to_sub_small_range(i32 range(i32 -1, 1) %a) {
+; CHECK-LABEL: define i32 @scmp_to_sub_small_range(
+; CHECK-SAME: i32 range(i32 -1, 1) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[A]], 0
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %scmp = call i32 @llvm.scmp(i32 %a, i32 0)
+ ret i32 %scmp
+}
+
+define i32 @ucmp_to_sub(i32 range(i32 0, 3) %a) {
+; CHECK-LABEL: define i32 @ucmp_to_sub(
+; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = sub i32 [[A]], 1
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i32 %a, i32 1)
+ ret i32 %ucmp
+}
+
+define i8 @ucmp_to_sub_trunc(i32 range(i32 0, 3) %a) {
+; CHECK-LABEL: define i8 @ucmp_to_sub_trunc(
+; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) {
+; CHECK-NEXT: [[UCMP1:%.*]] = sub i32 [[A]], 1
+; CHECK-NEXT: [[UCMP:%.*]] = trunc i32 [[UCMP1]] to i8
+; CHECK-NEXT: ret i8 [[UCMP]]
+;
+ %ucmp = call i8 @llvm.ucmp(i32 %a, i32 1)
+ ret i8 %ucmp
+}
+
+define i64 @ucmp_to_sub_sext(i32 range(i32 0, 3) %a) {
+; CHECK-LABEL: define i64 @ucmp_to_sub_sext(
+; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) {
+; CHECK-NEXT: [[UCMP1:%.*]] = sub i32 [[A]], 1
+; CHECK-NEXT: [[UCMP:%.*]] = sext i32 [[UCMP1]] to i64
+; CHECK-NEXT: ret i64 [[UCMP]]
+;
+ %ucmp = call i64 @llvm.ucmp(i32 %a, i32 1)
+ ret i64 %ucmp
+}
+
+; TODO: we can fold this into %a.
+define i32 @ucmp_to_sub_small_range(i32 range(i32 0, 2) %a) {
+; CHECK-LABEL: define i32 @ucmp_to_sub_small_range(
+; CHECK-SAME: i32 range(i32 0, 2) [[A:%.*]]) {
+; CHECK-NEXT: [[UCMP:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[UCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i32 %a, i32 0)
+ ret i32 %ucmp
+}
+
+define i32 @scmp_to_sub_large_range(i32 range(i32 -1, 3) %a) {
+; CHECK-LABEL: define i32 @scmp_to_sub_large_range(
+; CHECK-SAME: i32 range(i32 -1, 3) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %scmp = call i32 @llvm.scmp(i32 %a, i32 0)
+ ret i32 %scmp
+}
+
+define i32 @ucmp_to_sub_large_range(i32 range(i32 -1, 3) %a) {
+; CHECK-LABEL: define i32 @ucmp_to_sub_large_range(
+; CHECK-SAME: i32 range(i32 -1, 3) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i32 %a, i32 0)
+ ret i32 %ucmp
+}
+
+define i32 @scmp_to_sub_wrap(i8 range(i8 127, -126) %a) {
+; CHECK-LABEL: define i32 @scmp_to_sub_wrap(
+; CHECK-SAME: i8 range(i8 127, -126) [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i8(i8 [[A]], i8 -128)
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %scmp = call i32 @llvm.scmp(i8 %a, i8 -128)
+ ret i32 %scmp
+}
+
+define i32 @ucmp_to_sub_wrap(i8 range(i8 -1, 2) %a) {
+; CHECK-LABEL: define i32 @ucmp_to_sub_wrap(
+; CHECK-SAME: i8 range(i8 -1, 2) [[A:%.*]]) {
+; CHECK-NEXT: [[UCMP:%.*]] = call i32 @llvm.ucmp.i32.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i32 [[UCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i8 %a, i8 0)
+ ret i32 %ucmp
+}
+
+; It is incorrect to convert a ucmp into sub when the input type is i1.
+define i32 @ucmp_to_sub_i1_rhs_const(i1 %a) {
+; CHECK-LABEL: define i32 @ucmp_to_sub_i1_rhs_const(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 [[A]], i1 false)
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i1 %a, i1 false)
+ ret i32 %ucmp
+}
+
+; It is incorrect to convert a ucmp into sub when the input type is i1.
+define i32 @ucmp_to_sub_i1_lhs_const(i1 %a) {
+; CHECK-LABEL: define i32 @ucmp_to_sub_i1_lhs_const(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 false, i1 [[A]])
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i1 false, i1 %a)
+ ret i32 %ucmp
+}
+
+; It is incorrect to convert a ucmp into sub when the input type is i1.
+define i32 @ucmp_to_sub_i1(i1 %a, i1 %b) {
+; CHECK-LABEL: define i32 @ucmp_to_sub_i1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 [[A]], i1 [[B]])
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %ucmp = call i32 @llvm.ucmp(i1 %a, i1 %b)
+ ret i32 %ucmp
+}
+
+; It is incorrect to convert a scmp into sub when the input type is i1.
+define i32 @scmp_to_sub_i1_rhs_const(i1 %a) {
+; CHECK-LABEL: define i32 @scmp_to_sub_i1_rhs_const(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i1(i1 [[A]], i1 false)
+; CHECK-NEXT: ret i32 [[SCMP]]
+;
+ %scmp = call i32 @llvm.scmp(i1 %a, i1 false)
+ ret i32 %scmp
+}
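; Worked arithmetic behind the folds above (a summary drawn from the tests,
; not text from the commit): for %a in the signed range [-1, 2), i.e.
; {-1, 0, 1}, scmp(%a, 0) yields -1, 0, 1 respectively, which is exactly
; %a - 0, hence `sub nsw i32 %a, 0`. For %a in the unsigned range [0, 3),
; ucmp(%a, 1) yields -1, 0, 1, which is %a - 1. The i1 cases must stay calls
; because the sub is done in the input width: sub i1 true, false is i1 1,
; which sign-extends to i32 -1 instead of the required +1.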
diff --git a/llvm/test/Transforms/SROA/alloca-address-space.ll b/llvm/test/Transforms/SROA/alloca-address-space.ll
index 4c638a9..31305c8 100644
--- a/llvm/test/Transforms/SROA/alloca-address-space.ll
+++ b/llvm/test/Transforms/SROA/alloca-address-space.ll
@@ -140,12 +140,10 @@ define void @addressspace_alloca_lifetime() {
; CHECK-NEXT: ret void
;
%alloca = alloca i8, align 8, addrspace(2)
- %cast = addrspacecast ptr addrspace(2) %alloca to ptr
- call void @llvm.lifetime.start.p0(i64 2, ptr %cast)
+ call void @llvm.lifetime.start(i64 2, ptr addrspace(2) %alloca)
ret void
}
-declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 145da52..3034aaa 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -1834,8 +1834,7 @@ define void @PR27999() unnamed_addr {
entry-block:
%0 = alloca [2 x i64], align 8
call void @llvm.lifetime.start.p0(i64 16, ptr %0)
- %1 = getelementptr inbounds [2 x i64], ptr %0, i32 0, i32 1
- call void @llvm.lifetime.end.p0(i64 8, ptr %1)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %0)
ret void
}
diff --git a/llvm/test/Transforms/SROA/ignore-droppable.ll b/llvm/test/Transforms/SROA/ignore-droppable.ll
index 0b9a036b..9c95dc0 100644
--- a/llvm/test/Transforms/SROA/ignore-droppable.ll
+++ b/llvm/test/Transforms/SROA/ignore-droppable.ll
@@ -55,10 +55,10 @@ define void @positive_gep_assume_uses() {
;
%A = alloca {i8, i16}
%B = getelementptr {i8, i16}, ptr %A, i32 0, i32 0
- call void @llvm.lifetime.start.p0(i64 2, ptr %B)
+ call void @llvm.lifetime.start.p0(i64 2, ptr %A)
call void @llvm.assume(i1 true) ["align"(ptr %B, i64 8), "align"(ptr %B, i64 16)]
store {i8, i16} zeroinitializer, ptr %A
- call void @llvm.lifetime.end.p0(i64 2, ptr %B)
+ call void @llvm.lifetime.end.p0(i64 2, ptr %A)
call void @llvm.assume(i1 true) ["nonnull"(ptr %B), "align"(ptr %B, i64 2)]
ret void
}
diff --git a/llvm/test/Transforms/SafeStack/X86/coloring2.ll b/llvm/test/Transforms/SafeStack/X86/coloring2.ll
index 2e02ea6..ae5f375 100644
--- a/llvm/test/Transforms/SafeStack/X86/coloring2.ll
+++ b/llvm/test/Transforms/SafeStack/X86/coloring2.ll
@@ -478,43 +478,6 @@ l2:
br label %l2
}
-; This test checks for a bug where the stack coloring algorithm was not tracking
-; the live range of allocas through phi instructions, so it did not consider
-; alloca and alloca2 to be live at the same time. As a result it was using
-; the same stack slot for both allocas. To ensure this bug isn't present, we
-; check that there are 64 bytes allocated for the unsafe stack which is enough
-; space for both allocas.
-; CHECK-LABEL: @stack_coloring_liveness_bug
-define void @stack_coloring_liveness_bug(i32 %arg0) #0 {
-entry:
-; CHECK: %[[USP:.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr
-; CHECK-NEXT: getelementptr i8, ptr %[[USP]], i32 -64
- %alloca = alloca [32 x i8], align 16
- %alloca2 = alloca [32 x i8], align 16
- %cond = icmp eq i32 %arg0, 0
- br i1 %cond, label %if, label %else
-
-if:
- br label %end
-
-else:
-; CHECK: getelementptr i8, ptr %[[USP]], i32 -32
- call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca)
- call void @capture8(ptr %alloca)
- call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca)
- br label %end
-
-end:
-; CHECK: getelementptr i8, ptr %[[USP]], i32 -64
- %alloca.end = phi ptr [ %alloca, %if], [%alloca, %else]
- call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca2)
- call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca.end)
- call void @capture2_8(ptr %alloca2, ptr %alloca.end)
- call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca2)
- call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca.end)
- ret void
-}
-
attributes #0 = { safestack }
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll b/llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll
new file mode 100644
index 0000000..b8d1b92
--- /dev/null
+++ b/llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='function(scalarizer)' -S < %s | FileCheck %s
+
+define void @func(<2 x i32> noundef %a, <2 x i32> noundef %b) {
+; CHECK-LABEL: define void @func(
+; CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x i32> [[A]], i64 0
+; CHECK-NEXT: [[B_I0:%.*]] = extractelement <2 x i32> [[B]], i64 0
+; CHECK-NEXT: [[UADDC_I0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A_I0]], i32 [[B_I0]])
+; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x i32> [[A]], i64 1
+; CHECK-NEXT: [[B_I1:%.*]] = extractelement <2 x i32> [[B]], i64 1
+; CHECK-NEXT: [[UADDC_I1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A_I1]], i32 [[B_I1]])
+; CHECK-NEXT: [[CARRY_ELEM1:%.*]] = extractvalue { i32, i1 } [[UADDC_I0]], 1
+; CHECK-NEXT: [[CARRY_ELEM11:%.*]] = extractvalue { i32, i1 } [[UADDC_I1]], 1
+; CHECK-NEXT: [[CARRY_ZEXT_I0:%.*]] = zext i1 [[CARRY_ELEM1]] to i32
+; CHECK-NEXT: [[CARRY_ZEXT_I1:%.*]] = zext i1 [[CARRY_ELEM11]] to i32
+; CHECK-NEXT: ret void
+;
+ %uaddc = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %carry = extractvalue { <2 x i32>, <2 x i1> } %uaddc, 1
+ %carry_zext = zext <2 x i1> %carry to <2 x i32>
+ ret void
+}
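; In the scalarized output above, the single vector overflow intrinsic is
; split into one { i32, i1 } @llvm.uadd.with.overflow.i32 call per lane, and
; the extractvalue/zext users are rewritten lane-by-lane on the scalar
; results (a summary of the CHECK lines, not wording from the commit).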
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
index f82d730..6f2833b 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
@@ -45,7 +45,7 @@
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=false \
; RUN: -passes='loop-mssa(licm,simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | \
-; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP32
+; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP6
;
; Single loop nest, not unswitched
; LOOP1: Loop at depth 1 containing:
@@ -55,23 +55,23 @@
;
; Half unswitched loop nests, with unscaled4 and div1 it gets fewer depth-1 loops unswitched
; since they have more cost.
-; LOOP-UNSCALE4-DIV1-COUNT-6: Loop at depth 1 containing:
-; LOOP-UNSCALE4-DIV1-COUNT-19: Loop at depth 2 containing:
-; LOOP-UNSCALE4-DIV1-COUNT-29: Loop at depth 3 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-4: Loop at depth 1 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-4: Loop at depth 2 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-4: Loop at depth 3 containing:
; LOOP-UNSCALE4-DIV1-NOT: Loop at depth {{[0-9]+}} containing:
;
; Half unswitched loop nests, with unscaled4 and div2 it gets more depth-1 loops unswitched
; as div2 kicks in.
-; LOOP-UNSCALE4-DIV2-COUNT-11: Loop at depth 1 containing:
-; LOOP-UNSCALE4-DIV2-COUNT-22: Loop at depth 2 containing:
-; LOOP-UNSCALE4-DIV2-COUNT-29: Loop at depth 3 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-4: Loop at depth 1 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-4: Loop at depth 2 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-4: Loop at depth 3 containing:
; LOOP-UNSCALE4-DIV2-NOT: Loop at depth {{[0-9]+}} containing:
;
-; 32 loop nests, fully unswitched
-; LOOP32-COUNT-32: Loop at depth 1 containing:
-; LOOP32-COUNT-32: Loop at depth 2 containing:
-; LOOP32-COUNT-32: Loop at depth 3 containing:
-; LOOP32-NOT: Loop at depth {{[0-9]+}} containing:
+; 6 loop nests, fully unswitched
+; LOOP6-COUNT-6: Loop at depth 1 containing:
+; LOOP6-COUNT-6: Loop at depth 2 containing:
+; LOOP6-COUNT-6: Loop at depth 3 containing:
+; LOOP6-NOT: Loop at depth {{[0-9]+}} containing:
declare void @bar()
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
index 63d2789..ab3b3d2 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
@@ -60,7 +60,7 @@
;
; Half unswitched loop nests, with unscaled3 and div1 it gets fewer depth-1 loops unswitched
; since they have more cost.
-; LOOP-UNSCALE3-DIV1-COUNT-4: Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV1-COUNT-2: Loop at depth 1 containing:
; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 1 containing:
; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 2 containing:
; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 2 containing:
@@ -69,7 +69,7 @@
;
; Half unswitched loop nests, with unscaled3 and div2 it gets more depth-1 loops unswitched
; as div2 kicks in.
-; LOOP-UNSCALE3-DIV2-COUNT-6: Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV2-COUNT-2: Loop at depth 1 containing:
; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 1 containing:
; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 2 containing:
; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 2 containing:
@@ -77,7 +77,7 @@
; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 3 containing:
;
; Maximally unswitched (one copy of the outer loop per condition)
-; LOOP-MAX-COUNT-6: Loop at depth 1 containing:
+; LOOP-MAX-COUNT-2: Loop at depth 1 containing:
; LOOP-MAX-NOT: Loop at depth 1 containing:
; LOOP-MAX-COUNT-1: Loop at depth 2 containing:
; LOOP-MAX-NOT: Loop at depth 2 containing:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
index a2a745f..7515cbb 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
@@ -25,46 +25,37 @@
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=true \
; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
-; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5
+; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP4
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=true \
; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
-; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5
-;
-; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
-; siblings multiplier for top-level loops (toplevel-div == 8) we should get
-; 2^(num conds) == 2^5 == 32
-; copies of the loop:
+; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP4
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=true \
; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
-; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=true \
; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
-; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
-;
-; Similarly get
-; 2^(num conds) == 2^5 == 32
-; copies of the loop when cost multiplier is disabled:
+; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=false \
-; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6
;
; RUN: opt < %s -enable-unswitch-cost-multiplier=false \
-; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6
;
; Single loop, not unswitched
; LOOP1: Loop at depth 1 containing:
; LOOP1-NOT: Loop at depth 1 containing:
-; 5 loops, unswitched 4 times
-; LOOP5-COUNT-5: Loop at depth 1 containing:
-; LOOP5-NOT: Loop at depth 1 containing:
+; 4 loops, unswitched 4 times
+; LOOP4-COUNT-4: Loop at depth 1 containing:
+; LOOP4-NOT: Loop at depth 1 containing:
-; 32 loops, fully unswitched
-; LOOP32-COUNT-32: Loop at depth 1 containing:
-; LOOP32-NOT: Loop at depth 1 containing:
+; 6 loops, fully unswitched
+; LOOP6-COUNT-6: Loop at depth 1 containing:
+; LOOP6-NOT: Loop at depth 1 containing:
define void @loop_simple5(ptr %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
entry:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
index 96fe899..846a779 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
@@ -61,19 +61,19 @@
; Somewhat relaxed restrictions on candidates:
; LOOP-RELAX-COUNT-5: Loop at depth 1 containing:
; LOOP-RELAX-NOT: Loop at depth 1 containing:
-; LOOP-RELAX-COUNT-32: Loop at depth 2 containing:
+; LOOP-RELAX-COUNT-5: Loop at depth 2 containing:
; LOOP-RELAX-NOT: Loop at depth 2 containing:
;
; Even more relaxed restrictions on candidates and siblings.
-; LOOP-RELAX2-COUNT-11: Loop at depth 1 containing:
+; LOOP-RELAX2-COUNT-5: Loop at depth 1 containing:
; LOOP-RELAX2-NOT: Loop at depth 1 containing:
-; LOOP-RELAX2-COUNT-40: Loop at depth 2 containing:
+; LOOP-RELAX2-COUNT-5: Loop at depth 2 containing:
; LOOP-RELAX-NOT: Loop at depth 2 containing:
;
; Unswitched as much as possible (with multiplier disabled).
-; LOOP-MAX-COUNT-56: Loop at depth 1 containing:
+; LOOP-MAX-COUNT-6: Loop at depth 1 containing:
; LOOP-MAX-NOT: Loop at depth 1 containing:
-; LOOP-MAX-COUNT-111: Loop at depth 2 containing:
+; LOOP-MAX-COUNT-11: Loop at depth 2 containing:
; LOOP-MAX-NOT: Loop at depth 2 containing:
define i32 @loop_switch(ptr %addr, i32 %c1, i32 %c2) {
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index 533b1f691..c77e7cc 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -38,25 +38,25 @@ exit:
}
define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
-; CHECK-LABEL: @test_two_guards(
+; CHECK-LABEL: define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK-NEXT: br i1 %cond1, label %entry.split.us, label %entry.split
; CHECK: entry.split.us:
-; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
-; CHECK: entry.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_US_US:%.*]]
-; CHECK: loop.us.us:
-; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ]
-; CHECK-NEXT: br label [[GUARDED_US_US:%.*]]
-; CHECK: guarded.us.us:
-; CHECK-NEXT: br label [[GUARDED_US2]]
-; CHECK: guarded.us2:
-; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1
-; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: deopt1:
-; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
-; CHECK-NEXT: unreachable
+; CHECK-NEXT: br label %loop.us
+; CHECK: loop.us:
+; CHECK-NEXT: %iv.us = phi i32 [ 0, %entry.split.us ], [ %iv.next.us, %guarded.us ]
+; CHECK-NEXT: br label %guarded.us
+; CHECK: guarded.us:
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ]
+; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1
+; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N
+; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %exit.split.us, !llvm.loop !2
+; CHECK: exit.split.us:
+; CHECK-NEXT: br label %exit
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: br label %deopt
; CHECK: deopt:
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
; CHECK-NEXT: unreachable
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6..3dc8320 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -5,7 +5,7 @@
define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0:![0-9]+]]
; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
; CHECK: loop.us:
@@ -20,7 +20,7 @@ define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
@@ -35,7 +35,7 @@ define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
@@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_01_neg_void_profile(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
@@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_01_constants(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300
; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
; CHECK: loop.us:
@@ -148,7 +148,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p
; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], 1000
-; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
@@ -160,7 +160,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
@@ -200,7 +200,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_01_neg_degenerate_profile(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
@@ -210,7 +210,7 @@ define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 n
; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]]
; CHECK: guarded:
; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]]
-; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]]
+; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]]
; CHECK: backedge:
; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
@@ -257,7 +257,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_01_neg_cold(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
@@ -267,7 +267,7 @@ define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit,
; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]]
; CHECK: guarded:
; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]]
-; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]]
+; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9:![0-9]+]]
; CHECK: backedge:
; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
@@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded
define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_01_neg_overflowing_metadata(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]]
-; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]]
+; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF10:![0-9]+]]
; CHECK: guarded:
; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]]
-; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]]
+; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF10]]
; CHECK: backedge:
; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
@@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_02(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]]
; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
; CHECK: loop.us:
@@ -386,7 +386,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
@@ -401,7 +401,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
@@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_02_inverse(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]]
; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
; CHECK: loop.us:
@@ -456,7 +456,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit,
; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
@@ -471,7 +471,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit,
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
@@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_03(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]]
; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
; CHECK: loop.us:
@@ -519,20 +519,20 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]]
; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0
-; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]]
+; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF15:![0-9]+]]
; CHECK: guarded.us:
; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]]
; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0
-; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]]
+; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF15]]
; CHECK: guarded:
; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]]
; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]]
@@ -541,7 +541,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
@@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded
define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) {
; CHECK-LABEL: @test_04(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]]
; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]]
; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
; CHECK: loop.us:
@@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]]
; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4
; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0
-; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]]
+; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF15]]
; CHECK: guarded.us:
; CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32
; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]]
@@ -597,13 +597,13 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]]
; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4
; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0
-; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]]
+; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF15]]
; CHECK: guarded:
; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32
; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]]
@@ -613,7 +613,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun
; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
@@ -651,17 +651,24 @@ range_check_failed: ; preds = %guarded
ret i32 -2
}
;.
-; CHECK: [[META0:![0-9]+]] = !{}
+; CHECK: [[META0]] = !{}
; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1}
-; CHECK: [[LOOP2]] = distinct !{!2, !3}
-; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"}
-; CHECK: [[LOOP4]] = distinct !{!4, !3}
-; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3}
-; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000}
-; CHECK: [[LOOP8]] = distinct !{!8, !3}
-; CHECK: [[LOOP9]] = distinct !{!9, !3}
-; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100}
-; CHECK: [[LOOP11]] = distinct !{!11, !3}
-; CHECK: [[LOOP12]] = distinct !{!12, !3}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"llvm.loop.unswitch.nontrivial.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META5]]}
+; CHECK: [[PROF8]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[PROF9]] = !{!"branch_weights", i32 2, i32 3}
+; CHECK: [[PROF10]] = !{!"branch_weights", i32 -1, i32 -1000}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]}
+; CHECK: [[PROF15]] = !{!"branch_weights", i32 1, i32 100}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META3]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META5]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META3]]}
+; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META5]]}
;.
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll
index fcef886..5f713fa 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll
@@ -14,27 +14,17 @@ define void @test_pr58136(i1 %c.1, i1 %c.2) {
; CHECK-NEXT: [[C_1_FR:%.*]] = freeze i1 [[C_1:%.*]]
; CHECK-NEXT: br i1 [[C_1_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
; CHECK: entry.split.us:
-; CHECK-NEXT: [[C_2_FR:%.*]] = freeze i1 [[C_2:%.*]]
-; CHECK-NEXT: br i1 [[C_2_FR]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
-; CHECK: entry.split.us.split.us:
; CHECK-NEXT: br label [[LOOP_HEADER_US_US:%.*]]
-; CHECK: loop.header.us.us:
-; CHECK-NEXT: [[MUL1_US_US:%.*]] = phi i16 [ [[MUL_US_US:%.*]], [[LOOP_LATCH_US_US:%.*]] ], [ [[GLOB_PROMOTED]], [[ENTRY_SPLIT_US_SPLIT_US]] ]
+; CHECK: loop.header.us:
+; CHECK-NEXT: [[MUL1_US_US:%.*]] = phi i16 [ [[MUL_US_US:%.*]], [[LOOP_LATCH_US:%.*]] ], [ [[GLOB_PROMOTED]], [[ENTRY_SPLIT_US]] ]
; CHECK-NEXT: [[CALL2_US_US:%.*]] = call i16 @foo()
-; CHECK-NEXT: br label [[THEN_BB_US_US:%.*]]
-; CHECK: then.bb.us.us:
-; CHECK-NEXT: br label [[LOOP_LATCH_US_US]]
-; CHECK: loop.latch.us.us:
+; CHECK-NEXT: br label [[LOOP_LATCH_US_US:%.*]]
+; CHECK: then.bb.us:
+; CHECK-NEXT: br i1 [[C_2:%.*]], label [[LOOP_LATCH_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK: loop.latch.us:
; CHECK-NEXT: [[MUL_US_US]] = mul nsw i16 [[MUL1_US_US]], [[L_3]]
; CHECK-NEXT: store i16 [[MUL_US_US]], ptr @glob, align 2
-; CHECK-NEXT: br label [[LOOP_HEADER_US_US]]
-; CHECK: entry.split.us.split:
-; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]]
-; CHECK: loop.header.us:
-; CHECK-NEXT: [[CALL2_US:%.*]] = call i16 @foo()
-; CHECK-NEXT: br label [[THEN_BB_US:%.*]]
-; CHECK: then.bb.us:
-; CHECK-NEXT: br label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br label [[LOOP_HEADER_US_US]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -89,7 +79,7 @@ define void @test_pr58158(i1 %c.1) {
; CHECK: outer.loopexit.us:
; CHECK-NEXT: br label [[OUTER_BACKEDGE_US:%.*]]
; CHECK: outer.backedge.us:
-; CHECK-NEXT: br label [[OUTER_US]]
+; CHECK-NEXT: br label [[OUTER_US]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: entry.split:
; CHECK-NEXT: br label [[OUTER:%.*]]
; CHECK: outer:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
index 8e97cb5..d07c2fa 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
@@ -32,7 +32,7 @@ define i32 @test1_freeze(ptr %ptr0, ptr %ptr1, ptr %ptr2) {
; CHECK-NEXT: br label [[LATCH_US:%.*]]
; CHECK: latch.us:
; CHECK-NEXT: [[V_US:%.*]] = load i1, ptr [[PTR0:%.*]], align 1
-; CHECK-NEXT: br i1 [[V_US]], label [[LOOP_BEGIN_US]], label [[LOOP_EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[V_US]], label [[LOOP_BEGIN_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: loop_exit.split.us:
; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: entry.split:
@@ -50,7 +50,7 @@ define i32 @test1_freeze(ptr %ptr0, ptr %ptr1, ptr %ptr2) {
; CHECK-NEXT: br label [[LATCH_US2:%.*]]
; CHECK: latch.us2:
; CHECK-NEXT: [[V_US3:%.*]] = load i1, ptr [[PTR0]], align 1
-; CHECK-NEXT: br i1 [[V_US3]], label [[LOOP_BEGIN_US1]], label [[LOOP_EXIT_SPLIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[V_US3]], label [[LOOP_BEGIN_US1]], label [[LOOP_EXIT_SPLIT_SPLIT_US:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: loop_exit.split.split.us:
; CHECK-NEXT: br label [[LOOP_EXIT_SPLIT:%.*]]
; CHECK: entry.split.split:
@@ -276,7 +276,7 @@ define i32 @test7b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) {
; CHECK-NEXT: [[V4_US:%.*]] = load i1, ptr [[PTR]], align 1
; CHECK-NEXT: br i1 [[V4_US]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_D_US:%.*]]
; CHECK: inner_inner_loop_d.us:
-; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: inner_inner_loop_exit.split.us:
; CHECK-NEXT: br label [[INNER_INNER_LOOP_EXIT]]
; CHECK: loop_exit.split.us:
@@ -512,7 +512,7 @@ define i32 @test8b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) {
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_INNER_LOOP_LATCH_US:%.*]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]]
; CHECK: inner_inner_loop_latch.us:
-; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: inner_inner_loop_exit.split.us:
; CHECK-NEXT: br label [[INNER_INNER_LOOP_EXIT]]
; CHECK: inner_loop_exit.loopexit.split.us:
@@ -614,7 +614,7 @@ define i32 @test10a(ptr %ptr, i1 %cond, ptr %a.ptr) {
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US_LOOPEXIT:%.*]], label [[LOOP_BEGIN_BACKEDGE_US:%.*]]
; CHECK: loop_begin.backedge.us:
-; CHECK-NEXT: br label [[LOOP_BEGIN_US]]
+; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loop_exit.split.us.loopexit:
; CHECK-NEXT: [[A_LCSSA_US_PH:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
; CHECK-NEXT: br label [[LOOP_EXIT_SPLIT_US]]
@@ -682,7 +682,7 @@ define i32 @test10b(ptr %ptr, i1 %cond, ptr %a.ptr) {
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_BEGIN_BACKEDGE_US]], label [[LOOP_EXIT_SPLIT_US:%.*]]
; CHECK: loop_begin.backedge.us:
-; CHECK-NEXT: br label [[LOOP_BEGIN_US]]
+; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: loop_exit.split.us:
; CHECK-NEXT: [[A_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
@@ -844,7 +844,7 @@ define i32 @test11b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) {
; CHECK-NEXT: br label [[INNER_LOOP_A_US:%.*]]
; CHECK: inner_loop_a.us:
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
-; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_US]]
+; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: inner_loop_exit.split.us:
; CHECK-NEXT: [[A_INNER_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[INNER_LOOP_A_US]] ]
; CHECK-NEXT: br label [[INNER_LOOP_EXIT:%.*]]
@@ -1033,7 +1033,7 @@ define i32 @test12b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) {
; CHECK-NEXT: br label [[INNER_INNER_LOOP_A_US:%.*]]
; CHECK: inner_inner_loop_a.us:
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
-; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: inner_inner_loop_exit.split.us:
; CHECK-NEXT: [[A_INNER_INNER_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[INNER_INNER_LOOP_A_US]] ]
; CHECK-NEXT: br label [[INNER_INNER_LOOP_EXIT:%.*]]
@@ -1142,7 +1142,7 @@ define i32 @test13a(ptr %ptr, i1 %cond, ptr %a.ptr, ptr %b.ptr) {
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], label [[LOOP_LATCH_US]]
; CHECK: loop_latch.us:
-; CHECK-NEXT: br label [[LOOP_BEGIN_US]]
+; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: loop_exit.split.us:
; CHECK-NEXT: [[LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
@@ -1237,7 +1237,7 @@ define i32 @test13b(ptr %ptr, i1 %cond, ptr %a.ptr, ptr %b.ptr) {
; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1
; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US_LOOPEXIT:%.*]], label [[LOOP_LATCH_US:%.*]]
; CHECK: loop_latch.us:
-; CHECK-NEXT: br label [[LOOP_BEGIN_US]]
+; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: loop_exit.split.us.loopexit:
; CHECK-NEXT: [[LCSSA_US_PH:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
; CHECK-NEXT: br label [[LOOP_EXIT_SPLIT_US]]
@@ -1356,7 +1356,7 @@ define void @test23(i1 %arg, ptr %ptr) {
; CHECK-NEXT: br label [[OUTER_LATCH_US:%.*]]
; CHECK: outer.latch.us:
; CHECK-NEXT: [[OUTER_COND_US:%.*]] = load i1, ptr [[PTR]], align 1
-; CHECK-NEXT: br i1 [[OUTER_COND_US]], label [[OUTER_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[OUTER_COND_US]], label [[OUTER_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -1426,10 +1426,10 @@ define i32 @test29(i32 %arg) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARG_FR:%.*]] = freeze i32 [[ARG:%.*]]
; CHECK-NEXT: switch i32 [[ARG_FR]], label [[ENTRY_SPLIT:%.*]] [
-; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]]
-; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US]]
-; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1:%.*]]
-; CHECK-NEXT: i32 3, label [[ENTRY_SPLIT]]
+; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]]
+; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US]]
+; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1:%.*]]
+; CHECK-NEXT: i32 3, label [[ENTRY_SPLIT]]
; CHECK-NEXT: ]
; CHECK: entry.split.us:
; CHECK-NEXT: br label [[HEADER_US:%.*]]
@@ -1456,7 +1456,7 @@ define i32 @test29(i32 %arg) {
; CHECK-NEXT: br label [[LATCH_US:%.*]]
; CHECK: latch.us:
; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i32 [[TMP_C_SUM_US]], 42
-; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: [[LCSSA_PHI_US:%.*]] = phi i32 [ [[TMP_C_SUM_US]], [[LATCH_US]] ]
; CHECK-NEXT: br label [[EXIT:%.*]]
@@ -1485,7 +1485,7 @@ define i32 @test29(i32 %arg) {
; CHECK-NEXT: br label [[LATCH_US18:%.*]]
; CHECK: latch.us18:
; CHECK-NEXT: [[CMP2_US19:%.*]] = icmp slt i32 [[TMP_C_SUM_US17]], 42
-; CHECK-NEXT: br i1 [[CMP2_US19]], label [[HEADER_US2]], label [[EXIT_SPLIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[CMP2_US19]], label [[HEADER_US2]], label [[EXIT_SPLIT_SPLIT_US:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit.split.split.us:
; CHECK-NEXT: [[LCSSA_PHI_US20:%.*]] = phi i32 [ [[TMP_C_SUM_US17]], [[LATCH_US18]] ]
; CHECK-NEXT: br label [[EXIT_SPLIT:%.*]]
@@ -1587,10 +1587,10 @@ define i32 @test30(i32 %arg) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARG_FR:%.*]] = freeze i32 [[ARG:%.*]]
; CHECK-NEXT: switch i32 [[ARG_FR]], label [[ENTRY_SPLIT:%.*]] [
-; CHECK-NEXT: i32 -1, label [[ENTRY_SPLIT]]
-; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]]
-; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US1:%.*]]
-; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1]]
+; CHECK-NEXT: i32 -1, label [[ENTRY_SPLIT]]
+; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]]
+; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US1:%.*]]
+; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1]]
; CHECK-NEXT: ]
; CHECK: entry.split.us:
; CHECK-NEXT: br label [[HEADER_US:%.*]]
@@ -1612,7 +1612,7 @@ define i32 @test30(i32 %arg) {
; CHECK-NEXT: br label [[LATCH_US:%.*]]
; CHECK: latch.us:
; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i32 [[TMP_B_SUM_US]], 42
-; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[LOOP_EXIT2_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[LOOP_EXIT2_SPLIT_US:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: loop.exit2.split.us:
; CHECK-NEXT: [[L2_PHI_US:%.*]] = phi i32 [ [[TMP_B_SUM_US]], [[LATCH_US]] ]
; CHECK-NEXT: br label [[LOOP_EXIT2:%.*]]
@@ -1636,7 +1636,7 @@ define i32 @test30(i32 %arg) {
; CHECK-NEXT: br label [[LATCH_US14:%.*]]
; CHECK: latch.us14:
; CHECK-NEXT: [[CMP2_US15:%.*]] = icmp slt i32 [[TMP_B_SUM_US13]], 42
-; CHECK-NEXT: br i1 [[CMP2_US15]], label [[HEADER_US2]], label [[LOOP_EXIT2_SPLIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[CMP2_US15]], label [[HEADER_US2]], label [[LOOP_EXIT2_SPLIT_SPLIT_US:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: loop.exit2.split.split.us:
; CHECK-NEXT: [[L2_PHI_US16:%.*]] = phi i32 [ [[TMP_B_SUM_US13]], [[LATCH_US14]] ]
; CHECK-NEXT: br label [[LOOP_EXIT2_SPLIT:%.*]]
@@ -2259,9 +2259,9 @@ define void @hoist_inner_loop_switch(ptr %ptr) {
; CHECK-NEXT: [[V1:%.*]] = call i32 @cond.i32()
; CHECK-NEXT: [[V1_FR:%.*]] = freeze i32 [[V1]]
; CHECK-NEXT: switch i32 [[V1_FR]], label [[B_HEADER_SPLIT:%.*]] [
-; CHECK-NEXT: i32 1, label [[B_HEADER_SPLIT_US:%.*]]
-; CHECK-NEXT: i32 2, label [[B_HEADER_SPLIT_US]]
-; CHECK-NEXT: i32 3, label [[B_HEADER_SPLIT_US]]
+; CHECK-NEXT: i32 1, label [[B_HEADER_SPLIT_US:%.*]]
+; CHECK-NEXT: i32 2, label [[B_HEADER_SPLIT_US]]
+; CHECK-NEXT: i32 3, label [[B_HEADER_SPLIT_US]]
; CHECK-NEXT: ]
; CHECK: b.header.split.us:
; CHECK-NEXT: br label [[C_HEADER_US:%.*]]
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll
index c86fa34..64b1829 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll
@@ -28,7 +28,7 @@ define i32 @basic(i32 %N, i1 %cond, i32 %select_input) {
; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[SELECT_INPUT]], [[TMP0]] ]
; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US]], [[RES_US]]
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.cond.cleanup.split.us:
; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
@@ -132,7 +132,7 @@ define i32 @select_phi_input(i32 %N, i1 %cond) {
; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP0]] ]
; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US]], [[RES_US]]
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.cond.cleanup.split.us:
; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
@@ -195,7 +195,7 @@ define i32 @basic_cond_noundef(i32 %N, i1 noundef %cond) {
; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP0]] ]
; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US]], [[RES_US]]
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.cond.cleanup.split.us:
; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
@@ -285,55 +285,24 @@ define i32 @chained_select(i32 %N, i1 %cond, i1 %cond2) {
; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]]
; CHECK-NEXT: br i1 [[COND_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
; CHECK: entry.split.us:
-; CHECK-NEXT: [[COND2_FR13:%.*]] = freeze i1 [[COND2]]
-; CHECK-NEXT: br i1 [[COND2_FR13]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
-; CHECK: entry.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND_US_US:%.*]]
-; CHECK: for.cond.us.us:
-; CHECK-NEXT: [[RES_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[ADD_US_US:%.*]], [[TMP3:%.*]] ]
-; CHECK-NEXT: [[I_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[INC_US_US:%.*]], [[TMP3]] ]
-; CHECK-NEXT: [[CMP_US_US:%.*]] = icmp slt i32 [[I_US_US]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_US_US]], label [[FOR_BODY_US_US:%.*]], label [[FOR_COND_CLEANUP_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: for.body.us.us:
-; CHECK-NEXT: br label [[TMP0:%.*]]
-; CHECK: 0:
-; CHECK-NEXT: br label [[TMP1:%.*]]
-; CHECK: 1:
-; CHECK-NEXT: [[UNSWITCHED_SELECT_US_US:%.*]] = phi i32 [ [[I_US_US]], [[TMP0]] ]
-; CHECK-NEXT: br label [[TMP2:%.*]]
-; CHECK: 2:
-; CHECK-NEXT: br label [[TMP3]]
-; CHECK: 3:
-; CHECK-NEXT: [[UNSWITCHED_SELECT_US11:%.*]] = phi i32 [ [[UNSWITCHED_SELECT_US_US]], [[TMP2]] ]
-; CHECK-NEXT: [[ADD_US_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US11]], [[RES_US_US]]
-; CHECK-NEXT: [[INC_US_US]] = add nuw nsw i32 [[I_US_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US_US]]
-; CHECK: for.cond.cleanup.split.us.split.us:
-; CHECK-NEXT: [[RES_LCSSA_US_US:%.*]] = phi i32 [ [[RES_US_US]], [[FOR_COND_US_US]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP_SPLIT_US:%.*]]
-; CHECK: entry.split.us.split:
; CHECK-NEXT: br label [[FOR_COND_US:%.*]]
; CHECK: for.cond.us:
-; CHECK-NEXT: [[RES_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT]] ], [ [[ADD_US:%.*]], [[TMP6:%.*]] ]
-; CHECK-NEXT: [[I_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT]] ], [ [[INC_US:%.*]], [[TMP6]] ]
+; CHECK-NEXT: [[RES_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[ADD_US:%.*]], [[TMP1:%.*]] ]
+; CHECK-NEXT: [[I_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[INC_US:%.*]], [[TMP1]] ]
; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i32 [[I_US]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_BODY_US:%.*]], label [[FOR_COND_CLEANUP_SPLIT_US_SPLIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_BODY_US:%.*]], label [[FOR_COND_CLEANUP_SPLIT_US:%.*]]
; CHECK: for.body.us:
-; CHECK-NEXT: br label [[TMP4:%.*]]
-; CHECK: 4:
-; CHECK-NEXT: br label [[TMP5:%.*]]
-; CHECK: 5:
-; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP4]] ]
-; CHECK-NEXT: br label [[TMP6]]
-; CHECK: 6:
-; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 24, [[RES_US]]
+; CHECK-NEXT: br label [[TMP0:%.*]]
+; CHECK: 0:
+; CHECK-NEXT: br label [[TMP1]]
+; CHECK: 1:
+; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP0]] ]
+; CHECK-NEXT: [[SELECT2_US:%.*]] = select i1 [[COND2]], i32 [[UNSWITCHED_SELECT_US]], i32 24
+; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[SELECT2_US]], [[RES_US]]
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US]]
-; CHECK: for.cond.cleanup.split.us.split:
-; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP_SPLIT_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.cond.cleanup.split.us:
-; CHECK-NEXT: [[DOTUS_PHI12:%.*]] = phi i32 [ [[RES_LCSSA_US]], [[FOR_COND_CLEANUP_SPLIT_US_SPLIT]] ], [ [[RES_LCSSA_US_US]], [[FOR_COND_CLEANUP_SPLIT_US_SPLIT_US]] ]
+; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: entry.split:
; CHECK-NEXT: [[COND2_FR:%.*]] = freeze i1 [[COND2]]
@@ -341,36 +310,36 @@ define i32 @chained_select(i32 %N, i1 %cond, i1 %cond2) {
; CHECK: entry.split.split.us:
; CHECK-NEXT: br label [[FOR_COND_US1:%.*]]
; CHECK: for.cond.us1:
-; CHECK-NEXT: [[RES_US2:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[ADD_US7:%.*]], [[TMP9:%.*]] ]
-; CHECK-NEXT: [[I_US3:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[INC_US8:%.*]], [[TMP9]] ]
+; CHECK-NEXT: [[RES_US2:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[ADD_US7:%.*]], [[TMP4:%.*]] ]
+; CHECK-NEXT: [[I_US3:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[INC_US8:%.*]], [[TMP4]] ]
; CHECK-NEXT: [[CMP_US4:%.*]] = icmp slt i32 [[I_US3]], [[N]]
; CHECK-NEXT: br i1 [[CMP_US4]], label [[FOR_BODY_US5:%.*]], label [[FOR_COND_CLEANUP_SPLIT_SPLIT_US:%.*]]
; CHECK: for.body.us5:
-; CHECK-NEXT: br label [[TMP7:%.*]]
-; CHECK: 7:
-; CHECK-NEXT: br label [[TMP8:%.*]]
-; CHECK: 8:
-; CHECK-NEXT: br label [[TMP9]]
-; CHECK: 9:
-; CHECK-NEXT: [[UNSWITCHED_SELECT_US6:%.*]] = phi i32 [ 42, [[TMP8]] ]
+; CHECK-NEXT: br label [[TMP2:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: br label [[TMP3:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: br label [[TMP4]]
+; CHECK: 4:
+; CHECK-NEXT: [[UNSWITCHED_SELECT_US6:%.*]] = phi i32 [ 42, [[TMP3]] ]
; CHECK-NEXT: [[ADD_US7]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US6]], [[RES_US2]]
; CHECK-NEXT: [[INC_US8]] = add nuw nsw i32 [[I_US3]], 1
-; CHECK-NEXT: br label [[FOR_COND_US1]]
+; CHECK-NEXT: br label [[FOR_COND_US1]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: for.cond.cleanup.split.split.us:
; CHECK-NEXT: [[RES_LCSSA_US9:%.*]] = phi i32 [ [[RES_US2]], [[FOR_COND_US1]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_SPLIT:%.*]]
; CHECK: entry.split.split:
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[ADD:%.*]], [[TMP11:%.*]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[INC:%.*]], [[TMP11]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[ADD:%.*]], [[TMP6:%.*]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[INC:%.*]], [[TMP6]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP_SPLIT_SPLIT:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: br label [[TMP10:%.*]]
-; CHECK: 10:
-; CHECK-NEXT: br label [[TMP11]]
-; CHECK: 11:
+; CHECK-NEXT: br label [[TMP5:%.*]]
+; CHECK: 5:
+; CHECK-NEXT: br label [[TMP6]]
+; CHECK: 6:
; CHECK-NEXT: [[ADD]] = add nuw nsw i32 24, [[RES]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
; CHECK-NEXT: br label [[FOR_COND]]
@@ -381,7 +350,7 @@ define i32 @chained_select(i32 %N, i1 %cond, i1 %cond2) {
; CHECK-NEXT: [[DOTUS_PHI10:%.*]] = phi i32 [ [[RES_LCSSA]], [[FOR_COND_CLEANUP_SPLIT_SPLIT]] ], [ [[RES_LCSSA_US9]], [[FOR_COND_CLEANUP_SPLIT_SPLIT_US]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[DOTUS_PHI:%.*]] = phi i32 [ [[DOTUS_PHI10]], [[FOR_COND_CLEANUP_SPLIT]] ], [ [[DOTUS_PHI12]], [[FOR_COND_CLEANUP_SPLIT_US]] ]
+; CHECK-NEXT: [[DOTUS_PHI:%.*]] = phi i32 [ [[DOTUS_PHI10]], [[FOR_COND_CLEANUP_SPLIT]] ], [ [[RES_LCSSA_US]], [[FOR_COND_CLEANUP_SPLIT_US]] ]
; CHECK-NEXT: ret i32 [[DOTUS_PHI]]
;
entry:
@@ -427,7 +396,7 @@ define i32 @select_in_if(i32 %N, i1 %cond) {
; CHECK-NEXT: [[P_US:%.*]] = phi i32 [ [[UNSWITCHED_SELECT_US:%.*]], [[TMP1:%.*]] ], [ 24, [[FOR_BODY_US]] ]
; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[P_US]], [[RES_US]]
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: 0:
; CHECK-NEXT: br label [[TMP1]]
; CHECK: 1:
@@ -517,7 +486,7 @@ define i32 @select_in_if_else(i32 %N, i1 %cond) {
; CHECK-NEXT: [[P_US:%.*]] = phi i32 [ [[COND1A_US]], [[FOR_BODY_IF_US]] ], [ [[UNSWITCHED_SELECT_US:%.*]], [[TMP1:%.*]] ]
; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[P_US]], [[RES_US]]
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1
-; CHECK-NEXT: br label [[FOR_COND_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: 0:
; CHECK-NEXT: br label [[TMP1]]
; CHECK: 1:
@@ -606,7 +575,7 @@ define dso_local void @select_nested_loop(i1 noundef zeroext %cond, i32 noundef
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.us:
; CHECK-NEXT: [[INC7_US_US]] = add nuw i32 [[I_018_US_US]], 1
; CHECK-NEXT: [[EXITCOND21_NOT_US:%.*]] = icmp eq i32 [[INC7_US_US]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND21_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US_US]]
+; CHECK-NEXT: br i1 [[EXITCOND21_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US_US]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: for.cond1.preheader.us.split.us.us:
; CHECK-NEXT: br label [[FOR_BODY4_US_US_US:%.*]]
; CHECK: for.body4.us.us.us:
@@ -619,7 +588,7 @@ define dso_local void @select_nested_loop(i1 noundef zeroext %cond, i32 noundef
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US_US]])
; CHECK-NEXT: [[INC_US_US_US]] = add nuw i32 [[J_016_US_US_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US_US:%.*]] = icmp eq i32 [[INC_US_US_US]], [[M]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US_US]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.split.us.us:
; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_US]]
; CHECK: for.cond.cleanup.loopexit.split.us:
@@ -707,7 +676,7 @@ define dso_local void @select_invariant_outer_loop(i1 noundef zeroext %cond, i32
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]])
; CHECK-NEXT: [[INC_US_US]] = add nuw i32 [[J_019_US_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US_US]], [[M]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US:%.*]], label [[FOR_BODY4_US_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US:%.*]], label [[FOR_BODY4_US_US]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.split.us:
; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
; CHECK: for.cond1.preheader.us.split:
@@ -782,7 +751,7 @@ define dso_local i32 @trivial_select_cond(i32 noundef %n, i32 noundef %a, i32 no
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]])
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_03_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: for.cond.cleanup.loopexit.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.body.preheader.split:
@@ -839,7 +808,7 @@ define i32 @and_lhs_invariant(i32 %num, i1 %cond) {
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]])
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: for.cond.cleanup.loopexit.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.body.preheader.split:
@@ -904,7 +873,7 @@ define i32 @and_rhs_invariant(i32 %num, i1 %cond) {
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]])
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: for.cond.cleanup.loopexit.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.body.preheader.split:
@@ -971,7 +940,7 @@ define i32 @or_lhs_invariant(i32 %num, i1 %cond) {
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]])
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.cond.cleanup.loopexit.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.body.preheader.split:
@@ -1038,7 +1007,7 @@ define i32 @or_rhs_invariant(i32 %num, i1 %cond) {
; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]])
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: for.cond.cleanup.loopexit.split.us:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.body.preheader.split:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
index 9567b6b..36f7a9e 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
@@ -2626,66 +2626,45 @@ loop_a:
; The second unswitched condition.
;
; CHECK: entry.split.us:
-; CHECK-NEXT: br i1 %cond2, label %entry.split.us.split.us, label %entry.split.us.split
+; CHECK-NEXT: br label %loop_begin.us
loop_a_a:
call i32 @a()
br label %latch
; The 'loop_a_a' unswitched loop.
;
-; CHECK: entry.split.us.split.us:
-; CHECK-NEXT: br label %loop_begin.us.us
-;
-; CHECK: loop_begin.us.us:
-; CHECK-NEXT: br label %loop_a.us.us
-;
-; CHECK: loop_a.us.us:
-; CHECK-NEXT: br label %loop_a_a.us.us
-;
-; CHECK: loop_a_a.us.us:
-; CHECK-NEXT: call i32 @a()
-; CHECK-NEXT: br label %latch.us.us
-;
-; CHECK: latch.us.us:
-; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr
-; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us.us, label %loop_exit.split.us.split.us
-;
-; CHECK: loop_exit.split.us.split.us:
-; CHECK-NEXT: br label %loop_exit.split
-
-loop_a_c:
- call i32 @c()
- br label %latch
-; The 'loop_a_c' unswitched loop.
-;
-; CHECK: entry.split.us.split:
-; CHECK-NEXT: br label %loop_begin.us
-;
; CHECK: loop_begin.us:
; CHECK-NEXT: br label %loop_a.us
;
; CHECK: loop_a.us:
-; CHECK-NEXT: br label %loop_a_c.us
+; CHECK-NEXT: br i1 %cond2, label %loop_a_a.us, label %loop_a_c.us
+;
+; The 'loop_a_c' unswitched loop.
;
; CHECK: loop_a_c.us:
; CHECK-NEXT: call i32 @c()
-; CHECK-NEXT: br label %latch
+; CHECK-NEXT: br label %latch.us
+;
+; CHECK: loop_a_a.us:
+; CHECK-NEXT: call i32 @a()
+; CHECK-NEXT: br label %latch.us
;
; CHECK: latch.us:
; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr
-; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us, label %loop_exit.split.us.split
+; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us, label %loop_exit.split.us, !llvm.loop !22
;
-; CHECK: loop_exit.split.us.split:
-; CHECK-NEXT: br label %loop_exit.split
+; CHECK: loop_exit.split.us:
+; CHECK-NEXT: br label %loop_exit
+
+loop_a_c:
+ call i32 @c()
+ br label %latch
loop_b:
call i32 @b()
br label %latch
; The 'loop_b' unswitched loop.
;
-; CHECK: entry.split:
-; CHECK-NEXT: br label %loop_begin
-;
; CHECK: loop_begin:
; CHECK-NEXT: br label %loop_b
;
@@ -2985,9 +2964,9 @@ loop_a:
;
; CHECK: [[LOOP_LATCH_A]]:
; CHECK-NEXT: %[[V_A:.*]] = load i1, ptr %ptr
-; CHECK: br i1 %[[V_A]], label %[[LOOP_BEGIN_A]], label %[[LOOP_EXIT_A:.*]]
+; CHECK: br i1 %[[V_A]], label %loop_begin.us, label %loop_exit.split.us, !llvm.loop !26
;
-; CHECK: [[LOOP_EXIT_A]]:
+; CHECK: loop_exit.split.us:
; CHECK-NEXT: br label %loop_exit
loop_b:
@@ -3007,10 +2986,10 @@ loop_b:
;
; CHECK: [[LOOP_LATCH_B]]:
; CHECK-NEXT: %[[V_B:.*]] = load i1, ptr %ptr
-; CHECK: br i1 %[[V_B]], label %[[LOOP_BEGIN_B]], label %[[LOOP_EXIT_B:.*]]
+; CHECK: br i1 %[[V_B]], label %loop_begin.us2, label %loop_exit.split.split.us, !llvm.loop !27
;
-; CHECK: [[LOOP_EXIT_B]]:
-; CHECK-NEXT: br label %loop_exit
+; CHECK: loop_exit.split.split.us:
+; CHECK-NEXT: br label %loop_exit.split
loop_c:
call i32 @c()
@@ -3029,10 +3008,10 @@ loop_c:
;
; CHECK: [[LOOP_LATCH_C]]:
; CHECK-NEXT: %[[V_C:.*]] = load i1, ptr %ptr
-; CHECK: br i1 %[[V_C]], label %[[LOOP_BEGIN_C]], label %[[LOOP_EXIT_C:.*]]
+; CHECK: br i1 %[[V_C]], label %loop_begin.us6, label %loop_exit.split.split.split.us, !llvm.loop !28
;
-; CHECK: [[LOOP_EXIT_C]]:
-; CHECK-NEXT: br label %loop_exit
+; CHECK: loop_exit.split.split.split.us:
+; CHECK-NEXT: br label %loop_exit.split.split
latch:
%v = load i1, ptr %ptr
@@ -3132,9 +3111,9 @@ body.a:
;
; CHECK: [[LATCH_A]]:
; CHECK-NEXT: %[[CMP2_A:.*]] = icmp slt i32 %[[TMP_C_SUM_A]], 42
-; CHECK: br i1 %[[CMP2_A]], label %[[HEADER_A]], label %[[LOOP_EXIT_A:.*]]
+; CHECK: br i1 %[[CMP2_A]], label %header.us, label %exit.split.us, !llvm.loop !29
;
-; CHECK: [[LOOP_EXIT_A]]:
+; CHECK: exit.split.us:
; CHECK-NEXT: %[[LCSSA_A:.*]] = phi i32 [ %[[TMP_C_SUM_A]], %[[LATCH_A]] ]
; CHECK-NEXT: br label %exit
@@ -3176,9 +3155,9 @@ body.b:
;
; CHECK: [[LATCH_B]]:
; CHECK-NEXT: %[[CMP2_B:.*]] = icmp slt i32 %[[TMP_C_SUM_B]], 42
-; CHECK: br i1 %[[CMP2_B]], label %[[HEADER_B]], label %[[LOOP_EXIT_B:.*]]
+; CHECK: br i1 %[[CMP2_B]], label %header.us2, label %exit.split.split.us, !llvm.loop !30
;
-; CHECK: [[LOOP_EXIT_B]]:
+; CHECK: exit.split.split.us:
; CHECK-NEXT: %[[LCSSA_B:.*]] = phi i32 [ %[[TMP_C_SUM_B]], %[[LATCH_B]] ]
; CHECK-NEXT: br label %[[EXIT_SPLIT:.*]]
@@ -3234,11 +3213,11 @@ exit:
%lcssa.phi = phi i32 [ %tmp.c.sum, %latch ]
ret i32 %lcssa.phi
; CHECK: [[EXIT_SPLIT]]:
-; CHECK-NEXT: %[[EXIT_PHI1:.*]] = phi i32 [ %[[LCSSA_C]], %[[LOOP_EXIT_C]] ], [ %[[LCSSA_B]], %[[LOOP_EXIT_B]] ]
+; CHECK-NEXT: %[[EXIT_PHI1:.*]] = phi i32 [ %[[LCSSA_C]], %[[LOOP_EXIT_C]] ], [ %[[LCSSA_B]], %exit.split.split.us ]
; CHECK-NEXT: br label %exit
; CHECK: exit:
-; CHECK-NEXT: %[[EXIT_PHI2:.*]] = phi i32 [ %[[EXIT_PHI1]], %[[EXIT_SPLIT]] ], [ %[[LCSSA_A]], %[[LOOP_EXIT_A]] ]
+; CHECK-NEXT: %[[EXIT_PHI2:.*]] = phi i32 [ %[[EXIT_PHI1]], %[[EXIT_SPLIT]] ], [ %[[LCSSA_A]], %exit.split.us ]
; CHECK-NEXT: ret i32 %[[EXIT_PHI2]]
}
@@ -3304,9 +3283,9 @@ body.a:
;
; CHECK: [[LATCH_A]]:
; CHECK-NEXT: %[[CMP2_A:.*]] = icmp slt i32 %[[TMP_B_SUM_A]], 42
-; CHECK: br i1 %[[CMP2_A]], label %[[HEADER_A]], label %[[LOOP_EXIT_A:.*]]
+; CHECK: br i1 %[[CMP2_A]], label %header.us, label %loop.exit2.split.us, !llvm.loop !31
;
-; CHECK: [[LOOP_EXIT_A]]:
+; CHECK: loop.exit2.split.us:
; CHECK-NEXT: %[[LCSSA_A:.*]] = phi i32 [ %[[TMP_B_SUM_A]], %[[LATCH_A]] ]
; CHECK-NEXT: br label %loop.exit2
@@ -3342,9 +3321,9 @@ body.b:
;
; CHECK: [[LATCH_B]]:
; CHECK-NEXT: %[[CMP2_B:.*]] = icmp slt i32 %[[TMP_B_SUM_B]], 42
-; CHECK: br i1 %[[CMP2_B]], label %[[HEADER_B]], label %[[LOOP_EXIT_B:.*]]
+; CHECK: br i1 %[[CMP2_B]], label %header.us2, label %loop.exit2.split.split.us, !llvm.loop !32
;
-; CHECK: [[LOOP_EXIT_B]]:
+; CHECK: loop.exit2.split.split.us:
; CHECK-NEXT: %[[LCSSA_B:.*]] = phi i32 [ %[[TMP_B_SUM_B]], %[[LATCH_B]] ]
; CHECK-NEXT: br label %[[LOOP_EXIT2_SPLIT:.*]]
@@ -3397,11 +3376,11 @@ loop.exit2:
%l2.phi = phi i32 [ %tmp.b.sum, %latch ]
br label %exit
; CHECK: [[LOOP_EXIT2_SPLIT]]:
-; CHECK-NEXT: %[[LOOP_EXIT_PHI1:.*]] = phi i32 [ %[[L2_PHI]], %[[LOOP_EXIT_EXIT]] ], [ %[[LCSSA_B]], %[[LOOP_EXIT_B]] ]
+; CHECK-NEXT: %[[LOOP_EXIT_PHI1:.*]] = phi i32 [ %[[L2_PHI]], %[[LOOP_EXIT_EXIT]] ], [ %[[LCSSA_B]], %loop.exit2.split.split.us ]
; CHECK-NEXT: br label %loop.exit2
;
; CHECK: loop.exit2:
-; CHECK-NEXT: %[[LOOP_EXIT_PHI2:.*]] = phi i32 [ %[[LOOP_EXIT_PHI1]], %[[LOOP_EXIT2_SPLIT]] ], [ %[[LCSSA_A]], %[[LOOP_EXIT_A]] ]
+; CHECK-NEXT: %[[LOOP_EXIT_PHI2:.*]] = phi i32 [ %[[LOOP_EXIT_PHI1]], %[[LOOP_EXIT2_SPLIT]] ], [ %[[LCSSA_A]], %loop.exit2.split.us ]
; CHECK-NEXT: br label %exit
exit:
@@ -4058,9 +4037,7 @@ entry:
; CHECK-NEXT: ]
;
; CHECK: [[ENTRY_SPLIT_US]]:
-; CHECK-NEXT: switch i32 %arg, label %[[ENTRY_SPLIT_US_SPLIT:.*]] [
-; CHECK-NEXT: i32 1, label %[[ENTRY_SPLIT_US_SPLIT_US:.*]]
-; CHECK-NEXT: ]
+; CHECK-NEXT: br label %outer.header.us
outer.header:
br label %inner.header
@@ -4074,66 +4051,13 @@ inner.header:
inner.body1:
%a = call i32 @a()
br label %inner.latch
-; The (super convoluted) fully unswitched loop around `@a`.
-;
-; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]:
-; CHECK-NEXT: br label %[[OUTER_HEADER_US_US:.*]]
-;
-; CHECK: [[OUTER_HEADER_US_US]]:
-; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_US_US:.*]]
-;
-; CHECK: [[OUTER_LATCH_US_US:.*]]:
-; CHECK-NEXT: %[[OUTER_COND_US_US:.*]] = call i1 @cond()
-; CHECK-NEXT: br i1 %[[OUTER_COND_US_US]], label %[[OUTER_HEADER_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]]
-;
-; CHECK: [[OUTER_HEADER_SPLIT_US_US]]:
-; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_SPLIT_US_US_US:.*]]
-;
-; CHECK: [[INNER_LOOPEXIT2_US_US:.*]]:
-; CHECK-NEXT: br label %[[OUTER_LATCH_US_US]]
-;
-; CHECK: [[OUTER_HEADER_SPLIT_SPLIT_US_US_US]]:
-; CHECK-NEXT: br label %[[INNER_HEADER_US_US_US:.*]]
-;
-; CHECK: [[INNER_HEADER_US_US_US]]:
-; CHECK-NEXT: br label %[[INNER_BODY1_US_US_US:.*]]
-;
-; CHECK: [[INNER_BODY1_US_US_US]]:
-; CHECK-NEXT: %[[A:.*]] = call i32 @a()
-; CHECK-NEXT: br label %[[INNER_LATCH_US_US_US:.*]]
-;
-; CHECK: [[INNER_LATCH_US_US_US]]:
-; CHECK-NEXT: %[[PHI_A:.*]] = phi i32 [ %[[A]], %[[INNER_BODY1_US_US_US]] ]
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 0)
-; CHECK-NEXT: call void @sink1(i32 %[[PHI_A]])
-; CHECK-NEXT: %[[INNER_COND_US_US_US:.*]] = call i1 @cond()
-; CHECK-NEXT: br i1 %[[INNER_COND_US_US_US]], label %[[INNER_HEADER_US_US_US]], label %[[INNER_LOOPEXIT2_SPLIT_US_US_US:.*]]
-;
-; CHECK: [[INNER_LOOPEXIT2_SPLIT_US_US_US]]:
-; CHECK-NEXT: br label %[[INNER_LOOPEXIT2_US_US]]
-;
-; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]:
-; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]]
-
inner.body2:
%b = call i32 @b()
br label %inner.latch
; The fully unswitched loop around `@b`.
;
-; CHECK: [[ENTRY_SPLIT_US_SPLIT]]:
-; CHECK-NEXT: br label %[[OUTER_HEADER_US:.*]]
-;
-; CHECK: [[OUTER_HEADER_US]]:
+; CHECK: outer.header.us:
; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_US:.*]]
;
; CHECK: [[INNER_HEADER_US:.*]]:
@@ -4163,18 +4087,51 @@ inner.body2:
;
; CHECK: [[OUTER_LATCH_US:.*]]:
; CHECK-NEXT: %[[OUTER_COND_US:.*]] = call i1 @cond()
-; CHECK-NEXT: br i1 %[[OUTER_COND_US]], label %[[OUTER_HEADER_US]], label %[[EXIT_SPLIT_US_SPLIT:.*]]
+; CHECK-NEXT: br i1 %[[OUTER_COND_US]], label %outer.header.us, label %exit.split.us, !llvm.loop !33
;
; CHECK: [[OUTER_HEADER_SPLIT_US]]:
-; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_SPLIT_US:.*]]
+; CHECK-NEXT: switch i32 %arg, label %outer.header.split.split.us5 [
+; CHECK-NEXT: i32 1, label %outer.header.split.split.us.us
+; CHECK-NEXT: ]
;
-; CHECK: [[OUTER_HEADER_SPLIT_SPLIT_US]]:
+; CHECK: outer.header.split.split.us5:
; CHECK-NEXT: br label %[[INNER_HEADER_US]]
;
; CHECK: [[INNER_LOOPEXIT2_US]]:
; CHECK-NEXT: br label %[[OUTER_LATCH_US]]
+
+; The (super convoluted) fully unswitched loop around `@a`.
+;
+; CHECK: outer.header.split.split.us.us:
+; CHECK-NEXT: br label %[[INNER_HEADER_US_US:.*]]
+;
+; CHECK: [[INNER_HEADER_US_US]]:
+; CHECK-NEXT: br label %[[INNER_BODY1_US_US:.*]]
+;
+; CHECK: [[INNER_BODY1_US_US]]:
+; CHECK-NEXT: %[[A:.*]] = call i32 @a()
+; CHECK-NEXT: br label %[[INNER_LATCH_US_US:.*]]
+;
+; CHECK: [[INNER_LATCH_US_US]]:
+; CHECK-NEXT: %[[PHI_A:.*]] = phi i32 [ %[[A]], %[[INNER_BODY1_US_US]] ]
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 0)
+; CHECK-NEXT: call void @sink1(i32 %[[PHI_A]])
+; CHECK-NEXT: %[[INNER_COND_US_US:.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 %[[INNER_COND_US_US]], label %[[INNER_HEADER_US_US]], label %[[INNER_LOOPEXIT2_SPLIT_US_US:.*]], !llvm.loop !34
+;
+; CHECK: [[INNER_LOOPEXIT2_SPLIT_US_US]]:
+; CHECK-NEXT: br label %[[INNER_LOOPEXIT2_US]]
;
-; CHECK: [[EXIT_SPLIT_US]]:
+; CHECK: exit.split.us:
; CHECK-NEXT: br label %exit
inner.latch:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll
index a169aa4..e821dfc 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll
@@ -11,59 +11,43 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i16 [[A:%.*]], -6
; CHECK-NEXT: br i1 [[TMP0]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
; CHECK: entry.split.us:
-; CHECK-NEXT: br i1 [[C_1:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
-; CHECK: entry.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_1_HEADER_US_US:%.*]]
-; CHECK: loop.1.header.us.us:
-; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_US_US:%.*]]
-; CHECK: loop.1.header.split.us.us.us:
-; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: loop.1.header.split.us.split.us.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: entry.split.us.split:
; CHECK-NEXT: br label [[LOOP_1_HEADER_US:%.*]]
; CHECK: loop.1.header.us:
; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_US:%.*]]
-; CHECK: loop.4.header.us5:
+; CHECK: loop.4.header.us2:
; CHECK-NEXT: br label [[LOOP_5_US6:%.*]]
-; CHECK: loop.5.us6:
+; CHECK: loop.5.us3:
; CHECK-NEXT: [[IV_US7:%.*]] = phi i16 [ 0, [[LOOP_4_HEADER_US5:%.*]] ], [ [[IV_NEXT_US9:%.*]], [[LOOP_5_US6]] ]
; CHECK-NEXT: [[GEP_US8:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i16 [[IV_US7]]
; CHECK-NEXT: store ptr null, ptr [[GEP_US8]], align 8
; CHECK-NEXT: [[IV_NEXT_US9]] = add nuw nsw i16 [[IV_US7]], 1
; CHECK-NEXT: [[EC_US10:%.*]] = icmp ne i16 [[IV_US7]], 10000
-; CHECK-NEXT: br i1 [[EC_US10]], label [[LOOP_5_US6]], label [[LOOP_4_LATCH_US11:%.*]]
-; CHECK: loop.4.latch.us11:
+; CHECK-NEXT: br i1 [[EC_US10]], label [[LOOP_5_US6]], label [[LOOP_4_LATCH_US8:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: loop.4.latch.us8:
; CHECK-NEXT: br label [[LOOP_1_LATCH_US:%.*]]
; CHECK: loop.1.latch.us:
-; CHECK-NEXT: br label [[LOOP_1_HEADER_US]]
+; CHECK-NEXT: br label [[LOOP_1_HEADER_US]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: loop.4.header.preheader.us:
-; CHECK-NEXT: br i1 false, label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US_SPLIT_US:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US15:%.*]]
+; CHECK-NEXT: br i1 [[C_1:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT1_US_SPLIT_US:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT1_US9:%.*]]
; CHECK: loop.1.header.split.us.us:
; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US14:%.*]]
-; CHECK: loop.2.header.us.us12:
+; CHECK: loop.2.header.us.us:
; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_US_US13:%.*]]
; CHECK: loop.2.latch.us.us:
-; CHECK-NEXT: br i1 false, label [[LOOP_2_HEADER_US_US12:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US_US:%.*]]
-; CHECK: loop.2.header.split.us.us.us13:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US3_US:%.*]]
-; CHECK: loop.3.header.us.us1.us:
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US14]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US_US:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.2.header.split.us.us.us:
; CHECK-NEXT: br label [[LOOP_3_LATCH_US_US2_US:%.*]]
-; CHECK: loop.3.latch.us.us2.us:
+; CHECK: loop.3.header.us.us.us:
; CHECK-NEXT: br label [[LOOP_2_LATCH_SPLIT_US_US_US:%.*]]
+; CHECK: loop.3.latch.us.us.us:
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_LATCH_US_US2_US]], label [[LOOP_2_LATCH_SPLIT_US_US_US1:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: loop.2.latch.split.us.us.us:
-; CHECK-NEXT: br label [[LOOP_2_LATCH_US_US:%.*]]
-; CHECK: loop.2.header.split.us.split.us3.us:
; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US1_US:%.*]]
; CHECK: loop.4.header.preheader.split.us.us:
-; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER_US:%.*]]
-; CHECK: loop.1.header.split.us.split.us14:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_US_US12]]
-; CHECK: loop.4.header.preheader.split4.us15:
+; CHECK-NEXT: br label [[LOOP_2_HEADER_US_US12:%.*]]
+; CHECK: loop.4.header.preheader.split1.us9:
; CHECK-NEXT: br label [[LOOP_4_HEADER_US5]]
-; CHECK: loop.4.header.preheader.split4.us.split.us:
-; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US:%.*]]
-; CHECK: loop.1.header.split.us.split.us.split.us:
+; CHECK: loop.4.header.preheader.split1.us.split.us:
; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US:%.*]]
; CHECK: entry.split:
; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]]
@@ -71,36 +55,20 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i16 [[A]], -6
; CHECK-NEXT: br i1 [[TMP1]], label [[LOOP_1_HEADER_SPLIT_US:%.*]], label [[LOOP_1_HEADER_SPLIT:%.*]]
; CHECK: loop.1.header.split.us:
-; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US_SPLIT:%.*]], label [[LOOP_1_HEADER_SPLIT_US_SPLIT:%.*]]
-; CHECK: loop.1.header.split.us.split.us.split:
-; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US]]
-; CHECK: loop.1.header.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_US_US:%.*]]
-; CHECK: loop.2.header.us.us:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_US_US:%.*]]
-; CHECK: loop.2.header.split.us.us.us:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: loop.2.header.split.us.split.us.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: loop.1.header.split.us.split:
; CHECK-NEXT: br label [[LOOP_2_HEADER_US:%.*]]
; CHECK: loop.2.header.us:
; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_US:%.*]]
; CHECK: loop.2.latch.us:
-; CHECK-NEXT: br i1 false, label [[LOOP_2_HEADER_US]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER_US]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US:%.*]], !llvm.loop [[LOOP3]]
; CHECK: loop.2.header.split.us.us:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US3:%.*]]
-; CHECK: loop.3.header.us.us1:
; CHECK-NEXT: br label [[LOOP_3_LATCH_US_US2:%.*]]
-; CHECK: loop.3.latch.us.us2:
+; CHECK: loop.3.header.us.us:
; CHECK-NEXT: br label [[LOOP_2_LATCH_SPLIT_US_US:%.*]]
+; CHECK: loop.3.latch.us.us:
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_LATCH_US_US2]], label [[LOOP_2_LATCH_SPLIT_US_US1:%.*]], !llvm.loop [[LOOP4]]
; CHECK: loop.2.latch.split.us.us:
-; CHECK-NEXT: br label [[LOOP_2_LATCH_US:%.*]]
-; CHECK: loop.2.header.split.us.split.us3:
; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US1:%.*]]
; CHECK: loop.4.header.preheader.split.us:
-; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER:%.*]]
-; CHECK: loop.2.header.split.us.split.us.split.us:
; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US:%.*]]
; CHECK: loop.1.header.split:
; CHECK-NEXT: br label [[LOOP_2_HEADER:%.*]]
@@ -108,21 +76,11 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i16 [[A]], -6
; CHECK-NEXT: br i1 [[TMP2]], label [[LOOP_2_HEADER_SPLIT_US:%.*]], label [[LOOP_2_HEADER_SPLIT:%.*]]
; CHECK: loop.2.header.split.us:
-; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US_SPLIT:%.*]], label [[LOOP_2_HEADER_SPLIT_US_SPLIT:%.*]]
-; CHECK: loop.2.header.split.us.split.us.split:
-; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US]]
-; CHECK: loop.2.header.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US:%.*]]
-; CHECK: loop.3.header.us.us:
-; CHECK-NEXT: br label [[LOOP_3_LATCH_US_US:%.*]]
-; CHECK: loop.3.latch.us.us:
-; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US]]
-; CHECK: loop.2.header.split.us.split:
; CHECK-NEXT: br label [[LOOP_3_HEADER_US:%.*]]
; CHECK: loop.3.header.us:
; CHECK-NEXT: br label [[LOOP_3_LATCH_US:%.*]]
; CHECK: loop.3.latch.us:
-; CHECK-NEXT: br label [[LOOP_2_LATCH_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_HEADER_US]], label [[LOOP_2_LATCH_SPLIT_US:%.*]], !llvm.loop [[LOOP4]]
; CHECK: loop.2.latch.split.us:
; CHECK-NEXT: br label [[LOOP_2_LATCH:%.*]]
; CHECK: loop.2.header.split:
@@ -134,18 +92,18 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) {
; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: br label [[LOOP_3_LATCH]]
; CHECK: loop.3.latch:
-; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_HEADER]], label [[LOOP_2_LATCH_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_HEADER]], label [[LOOP_2_LATCH_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loop.2.latch.split:
; CHECK-NEXT: br label [[LOOP_2_LATCH]]
; CHECK: loop.2.latch:
-; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER]], label [[LOOP_4_HEADER_PREHEADER_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER]], label [[LOOP_4_HEADER_PREHEADER_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: loop.4.header.preheader.split:
-; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER]]
+; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US]]
; CHECK: loop.4.header.preheader:
; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US_SPLIT:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT4:%.*]]
-; CHECK: loop.4.header.preheader.split4.us.split:
-; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US]]
-; CHECK: loop.4.header.preheader.split4.us:
+; CHECK: loop.4.header.preheader.split1.us.split:
+; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US]]
+; CHECK: loop.4.header.preheader.split1.us:
; CHECK-NEXT: br label [[LOOP_4_HEADER_US:%.*]]
; CHECK: loop.4.header.us:
; CHECK-NEXT: br label [[LOOP_5_US:%.*]]
@@ -158,7 +116,7 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) {
; CHECK-NEXT: br i1 [[EC_US]], label [[LOOP_5_US]], label [[LOOP_4_LATCH_US:%.*]]
; CHECK: loop.4.latch.us:
; CHECK-NEXT: br label [[LOOP_4_HEADER_US]]
-; CHECK: loop.4.header.preheader.split4:
+; CHECK: loop.4.header.preheader.split1:
; CHECK-NEXT: br label [[LOOP_4_HEADER:%.*]]
; CHECK: loop.4.header:
; CHECK-NEXT: br label [[LOOP_5:%.*]]
@@ -168,11 +126,11 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) {
; CHECK-NEXT: store ptr null, ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp ne i16 [[IV]], 10000
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP_5]], label [[LOOP_4_LATCH:%.*]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP_5]], label [[LOOP_4_LATCH:%.*]], !llvm.loop [[LOOP0]]
; CHECK: loop.4.latch:
; CHECK-NEXT: br label [[LOOP_1_LATCH:%.*]]
; CHECK: loop.1.latch:
-; CHECK-NEXT: br label [[LOOP_1_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br label [[LOOP_1_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
;
entry:
br label %loop.1.header
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
index 1d89420..108b2406 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
@@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -84,7 +84,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -151,7 +151,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -223,7 +223,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[RED_NEXT_US]] = phi i32 [ [[ADD_10_US]], [[NOCLOBBER_US]] ]
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: [[RED_NEXT_LCSSA_US:%.*]] = phi i32 [ [[RED_NEXT_US]], [[LOOP_LATCH_US]] ]
; CHECK-NEXT: br label [[EXIT:%.*]]
@@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ]
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ]
; CHECK-NEXT: br label [[EXIT]]
@@ -305,7 +305,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -619,7 +619,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_LOOPEXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_LOOPEXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: exit.loopexit.split.us:
; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]]
; CHECK: loop.ph.split:
@@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit.loopexit.split:
; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
; CHECK: exit.loopexit:
@@ -695,7 +695,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -765,7 +765,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -1057,7 +1057,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -1112,19 +1112,11 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 41
; CHECK-NEXT: br i1 [[TMP3]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_US:%.*]]
; CHECK: entry.split.us:
-; CHECK-NEXT: br i1 [[EXIT_COND]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
-; CHECK: entry.split.us.split.us:
-; CHECK-NEXT: br label [[LOOP_US_US:%.*]]
-; CHECK: loop.us.us:
-; CHECK-NEXT: br label [[EXITING_US_US:%.*]]
-; CHECK: exiting.us.us:
-; CHECK-NEXT: br label [[LOOP_US_US]]
-; CHECK: entry.split.us.split:
; CHECK-NEXT: br label [[LOOP_US:%.*]]
; CHECK: loop.us:
; CHECK-NEXT: br label [[EXITING_US:%.*]]
; CHECK: exiting.us:
-; CHECK-NEXT: br label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: [[RET_VAL_US:%.*]] = phi i32 [ 1, [[EXITING_US]] ]
; CHECK-NEXT: br label [[EXIT:%.*]]
@@ -1138,7 +1130,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32
; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16
; CHECK-NEXT: br label [[EXITING]]
; CHECK: exiting:
-; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ]
; CHECK-NEXT: br label [[EXIT]]
@@ -1185,7 +1177,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 %
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -1249,7 +1241,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 %
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -1342,7 +1334,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -1360,7 +1352,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -1407,7 +1399,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) {
; CHECK: loop.latch.us:
; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: exit.split.us:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: entry.split:
@@ -1425,7 +1417,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) {
; CHECK: loop.latch:
; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: exit.split:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
@@ -1456,15 +1448,15 @@ exit:
ret i32 10
}
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]}
; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"}
-; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]}
; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]}
; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]}
; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[UNSWITCH_PARTIAL_DISABLE]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[UNSWITCH_PARTIAL_DISABLE]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[UNSWITCH_PARTIAL_DISABLE]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[UNSWITCH_PARTIAL_DISABLE]]}
+; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[UNSWITCH_PARTIAL_DISABLE]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[UNSWITCH_PARTIAL_DISABLE]]}
+; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[UNSWITCH_PARTIAL_DISABLE]]}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll
new file mode 100644
index 0000000..e24d17f
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>)" < %s | FileCheck %s
+
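+; Regression test for PR138509: after the pipeline runs, both unswitched
+; loops must carry llvm.loop.unswitch.nontrivial.disable metadata (asserted
+; by the CHECK lines below) so nontrivial unswitching is not reapplied.
+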
+@a = global i32 0, align 4
+@b = global i32 0, align 4
+@c = global i32 0, align 4
+@d = global i32 0, align 4
+
+define i32 @main() {
+entry:
+ br label %outer.loop.header
+
+outer.loop.header: ; preds = %outer.loop.latch, %entry
+ br i1 false, label %exit, label %outer.loop.body
+
+outer.loop.body: ; preds = %inner.loop.header, %outer.loop.header
+ store i32 1, ptr @c, align 4
+ %cmp = icmp sgt i32 0, -1
+ br i1 %cmp, label %outer.loop.latch, label %exit
+
+inner.loop.header: ; preds = %outer.loop.latch, %inner.loop.body
+ %a_val = load i32, ptr @a, align 4
+ %c_val = load i32, ptr @c, align 4
+ %mul = mul nsw i32 %c_val, %a_val
+ store i32 %mul, ptr @b, align 4
+ %cmp2 = icmp sgt i32 %mul, -1
+ br i1 %cmp2, label %inner.loop.body, label %outer.loop.body
+
+inner.loop.body: ; preds = %inner.loop.header
+ %mul2 = mul nsw i32 %c_val, 3
+ store i32 %mul2, ptr @c, align 4
+ store i32 %c_val, ptr @d, align 4
+ %mul3 = mul nsw i32 %c_val, %a_val
+ %cmp3 = icmp sgt i32 %mul3, -1
+ br i1 %cmp3, label %inner.loop.header, label %exit
+
+outer.loop.latch: ; preds = %outer.loop.body
+ %d_val = load i32, ptr @d, align 4
+ store i32 %d_val, ptr @b, align 4
+ %cmp4 = icmp eq i32 %d_val, 0
+ br i1 %cmp4, label %inner.loop.header, label %outer.loop.header
+
+exit: ; preds = %inner.loop.body, %outer.loop.body, %outer.loop.header
+ ret i32 0
+}
+
+; CHECK: [[LOOP0:.*]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unswitch.nontrivial.disable"}
+; CHECK: [[LOOP2:.*]] = distinct !{[[LOOP2]], [[META1]]}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll b/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll
index ef00d7e..4e428cb 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll
@@ -19,56 +19,42 @@ define i32 @foo(i1 %not) {
; CHECK-NEXT: [[FALSE:%.*]] = and i1 true, false
; CHECK-NEXT: br i1 [[NOT]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
; CHECK: entry.split.us:
-; CHECK-NEXT: br i1 [[FALSE]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
-; CHECK: entry.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND_US_US:%.*]]
-; CHECK: for.cond.us.us:
-; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_US_US:%.*]]
-; CHECK: for.cond.split.us.us.us:
-; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: for.cond.split.us.split.us.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: entry.split.us.split:
; CHECK-NEXT: br label [[FOR_COND_US:%.*]]
; CHECK: for.cond.us:
; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_US:%.*]]
; CHECK: for.inc11.us:
-; CHECK-NEXT: br label [[FOR_COND_US]]
+; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.cond.split.us.us:
-; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US11:%.*]]
-; CHECK: for.cond5.preheader.us.us9:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_US_US10:%.*]]
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US:%.*]]
+; CHECK: for.cond5.preheader.us.us:
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_US_US:%.*]]
; CHECK: for.inc8.us.us:
-; CHECK-NEXT: br i1 false, label [[FOR_INC8_FOR_COND5_PREHEADER_CRIT_EDGE_US_US:%.*]], label [[FOR_INC11_SPLIT_US_US:%.*]]
+; CHECK-NEXT: br i1 [[FALSE]], label [[FOR_INC8_FOR_COND5_PREHEADER_CRIT_EDGE_US_US:%.*]], label [[FOR_INC11_SPLIT_US_US:%.*]]
; CHECK: for.inc8.for.cond5.preheader_crit_edge.us.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US9:%.*]]
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.end.us.us:
-; CHECK-NEXT: br i1 false, label [[FOR_INC8_US_US:%.*]], label [[CLEANUP15_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: for.cond5.preheader.split.us.us.us10:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US7_US:%.*]]
-; CHECK: for.body7.us.us4.us:
-; CHECK-NEXT: br label [[HANDLER_POINTER_OVERFLOW_US_US5_US:%.*]]
-; CHECK: handler.pointer_overflow.us.us5.us:
-; CHECK-NEXT: br label [[CONT_US_US6_US:%.*]]
-; CHECK: cont.us.us6.us:
-; CHECK-NEXT: br label [[FOR_END_SPLIT_US_US_US:%.*]]
+; CHECK-NEXT: br i1 [[FALSE]], label [[FOR_INC8_US_US:%.*]], label [[CLEANUP15_SPLIT_US_SPLIT_US:%.*]]
+; CHECK: for.cond5.preheader.split.us.us.us:
+; CHECK-NEXT: br label [[FOR_BODY7_US_US_US:%.*]]
+; CHECK: for.body7.us.us.us:
+; CHECK-NEXT: br label [[HANDLER_POINTER_OVERFLOW_US_US_US:%.*]]
+; CHECK: handler.pointer_overflow.us.us.us:
+; CHECK-NEXT: br label [[CONT_US_US_US:%.*]]
+; CHECK: cont.us.us.us:
+; CHECK-NEXT: br i1 [[FALSE]], label [[CONT_FOR_BODY7_CRIT_EDGE_US_US_US:%.*]], label [[FOR_END_SPLIT_US_US_US:%.*]]
+; CHECK: cont.for.body7_crit_edge.us.us.us:
+; CHECK-NEXT: br label [[FOR_BODY7_US_US_US]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end.split.us.us.us:
; CHECK-NEXT: br label [[FOR_END_US_US:%.*]]
-; CHECK: for.cond5.preheader.split.us.split.us7.us:
-; CHECK-NEXT: br label [[FOR_BODY7_US_US4_US:%.*]]
; CHECK: for.inc11.split.us.us:
; CHECK-NEXT: br label [[FOR_INC11_US:%.*]]
-; CHECK: for.cond.split.us.split.us11:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US9]]
-; CHECK: for.cond.split.us.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US:%.*]]
; CHECK: cleanup15.split.us.split.us:
; CHECK-NEXT: br label [[CLEANUP15_SPLIT_US:%.*]]
; CHECK: entry.split:
; CHECK-NEXT: br i1 [[FALSE]], label [[ENTRY_SPLIT_SPLIT_US:%.*]], label [[ENTRY_SPLIT_SPLIT:%.*]]
; CHECK: entry.split.split.us:
-; CHECK-NEXT: br label [[FOR_COND_US12:%.*]]
-; CHECK: for.cond.us12:
+; CHECK-NEXT: br label [[FOR_COND_US5:%.*]]
+; CHECK: for.cond.us5:
; CHECK-NEXT: br label [[FOR_COND_SPLIT_US:%.*]]
; CHECK: for.cond.split.us:
; CHECK-NEXT: br label [[FOR_COND_SPLIT_SPLIT_US_SPLIT_US:%.*]]
@@ -78,23 +64,13 @@ define i32 @foo(i1 %not) {
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: br label [[FOR_COND_SPLIT:%.*]]
-; CHECK: for.cond.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US:%.*]]
-; CHECK: for.cond5.preheader.us.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_US_US:%.*]]
-; CHECK: for.cond5.preheader.split.us.us.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
-; CHECK: for.cond5.preheader.split.us.split.us.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US_SPLIT_US:%.*]]
; CHECK: cleanup15.split.us:
; CHECK-NEXT: br label [[CLEANUP15:%.*]]
-; CHECK: for.cond5.preheader.split.us.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US:%.*]]
; CHECK: for.cond.split:
; CHECK-NEXT: br label [[FOR_COND_SPLIT_SPLIT:%.*]]
; CHECK: for.cond.split.split.us:
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US8:%.*]]
-; CHECK: for.cond5.preheader.us8:
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US4:%.*]]
+; CHECK: for.cond5.preheader.us4:
; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US:%.*]]
; CHECK: for.cond5.preheader.split.us:
; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_SPLIT_US_SPLIT_US:%.*]]
@@ -104,16 +80,6 @@ define i32 @foo(i1 %not) {
; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]]
; CHECK: for.cond5.preheader:
; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT:%.*]]
-; CHECK: for.cond5.preheader.split.us.split.us:
-; CHECK-NEXT: br label [[FOR_BODY7_US_US:%.*]]
-; CHECK: for.body7.us.us:
-; CHECK-NEXT: br label [[HANDLER_POINTER_OVERFLOW_US_US:%.*]]
-; CHECK: handler.pointer_overflow.us.us:
-; CHECK-NEXT: br label [[CONT_US_US:%.*]]
-; CHECK: cont.us.us:
-; CHECK-NEXT: br label [[CONT_FOR_BODY7_CRIT_EDGE_US_US:%.*]]
-; CHECK: cont.for.body7_crit_edge.us.us:
-; CHECK-NEXT: br label [[FOR_BODY7_US_US]]
; CHECK: for.cond5.preheader.split:
; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_SPLIT:%.*]]
; CHECK: for.cond5.preheader.split.split.us:
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll b/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll
index 17ce141..162a3ab 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll
@@ -502,6 +502,7 @@ cleanupret2:
define void @f11() personality ptr @__CxxFrameHandler3 {
; CHECK-LABEL: @f11(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X:%.*]] = alloca i8, align 1
; CHECK-NEXT: invoke void @g()
; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
; CHECK: invoke.cont:
@@ -519,6 +520,7 @@ define void @f11() personality ptr @__CxxFrameHandler3 {
; CHECK-NEXT: ret void
;
entry:
+ %x = alloca i8
invoke void @g()
to label %invoke.cont unwind label %ehcleanup
@@ -531,7 +533,6 @@ invoke.cont2: ; preds = %invoke.cont
to label %return unwind label %catch.dispatch
ehcleanup: ; preds = %invoke.cont, %entry
- %x = phi ptr [ undef, %invoke.cont ], [ undef, %entry ]
%0 = cleanuppad within none []
call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %x)
cleanupret from %0 unwind label %catch.dispatch
diff --git a/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll b/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll
index ff031e9..ea14b17 100644
--- a/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll
+++ b/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll
@@ -67,17 +67,17 @@ invoke.cont:
lpad.v0:
%i8 = landingpad { ptr, i32 } cleanup
call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i0)
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i4)
br label %end
lpad.v1:
%i9 = landingpad { ptr, i32 } cleanup
call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i2)
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i6)
br label %end
end:
%i10 = phi { ptr, i32 } [ %i8, %lpad.v0 ], [ %i9, %lpad.v1 ]
- %i11 = phi ptr [ %i4, %lpad.v0 ], [ %i6, %lpad.v1 ]
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i11)
resume { ptr, i32 } %i10
}
;.
diff --git a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll
new file mode 100644
index 0000000..220556c
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll
@@ -0,0 +1,262 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- | FileCheck %s
+
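+; These tests cover the VectorCombine fold of a bitwise op through two
+; matching casts, i.e. bitop(cast(a), cast(b)) -> cast(bitop(a, b)); the
+; negative tests below show cases where the fold must not fire.
+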
+; Negative test: bitcast from float to int (optimization should not apply)
+define <4 x i32> @and_bitcast_v4f32_to_v4i32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @and_bitcast_v4f32_to_v4i32(
+; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[BC2:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[BC1]], [[BC2]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %bc1 = bitcast <4 x float> %a to <4 x i32>
+ %bc2 = bitcast <4 x float> %b to <4 x i32>
+ %and = and <4 x i32> %bc1, %bc2
+ ret <4 x i32> %and
+}
+
+; Test bitwise operations with integer-to-integer bitcast
+define <2 x i32> @or_bitcast_v4i16_to_v2i32(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @or_bitcast_v4i16_to_v2i32(
+; CHECK-NEXT: [[B:%.*]] = or <4 x i16> [[A:%.*]], [[B1:%.*]]
+; CHECK-NEXT: [[BC2:%.*]] = bitcast <4 x i16> [[B]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[BC2]]
+;
+ %bc1 = bitcast <4 x i16> %a to <2 x i32>
+ %bc2 = bitcast <4 x i16> %b to <2 x i32>
+ %or = or <2 x i32> %bc1, %bc2
+ ret <2 x i32> %or
+}
+
+define <16 x i8> @xor_bitcast_v2i64_to_v16i8(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @xor_bitcast_v2i64_to_v16i8(
+; CHECK-NEXT: [[B:%.*]] = xor <2 x i64> [[A:%.*]], [[B1:%.*]]
+; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+; CHECK-NEXT: ret <16 x i8> [[BC2]]
+;
+ %bc1 = bitcast <2 x i64> %a to <16 x i8>
+ %bc2 = bitcast <2 x i64> %b to <16 x i8>
+ %xor = xor <16 x i8> %bc1, %bc2
+ ret <16 x i8> %xor
+}
+
+; Test bitwise operations with truncate
+define <4 x i16> @and_trunc_v4i32_to_v4i16(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @and_trunc_v4i32_to_v4i16(
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i16>
+; CHECK-NEXT: ret <4 x i16> [[AND]]
+;
+ %t1 = trunc <4 x i32> %a to <4 x i16>
+ %t2 = trunc <4 x i32> %b to <4 x i16>
+ %and = and <4 x i16> %t1, %t2
+ ret <4 x i16> %and
+}
+
+define <8 x i8> @or_trunc_v8i16_to_v8i8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @or_trunc_v8i16_to_v8i8(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = trunc <8 x i16> [[OR_INNER]] to <8 x i8>
+; CHECK-NEXT: ret <8 x i8> [[OR]]
+;
+ %t1 = trunc <8 x i16> %a to <8 x i8>
+ %t2 = trunc <8 x i16> %b to <8 x i8>
+ %or = or <8 x i8> %t1, %t2
+ ret <8 x i8> %or
+}
+
+define <2 x i32> @xor_trunc_v2i64_to_v2i32(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @xor_trunc_v2i64_to_v2i32(
+; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = trunc <2 x i64> [[XOR_INNER]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[XOR]]
+;
+ %t1 = trunc <2 x i64> %a to <2 x i32>
+ %t2 = trunc <2 x i64> %b to <2 x i32>
+ %xor = xor <2 x i32> %t1, %t2
+ ret <2 x i32> %xor
+}
+
+; Test bitwise operations with zero extend
+define <4 x i32> @and_zext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_v4i16_to_v4i32(
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %z2 = zext <4 x i16> %b to <4 x i32>
+ %and = and <4 x i32> %z1, %z2
+ ret <4 x i32> %and
+}
+
+define <8 x i16> @or_zext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @or_zext_v8i8_to_v8i16(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = zext <8 x i8> [[OR_INNER]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[OR]]
+;
+ %z1 = zext <8 x i8> %a to <8 x i16>
+ %z2 = zext <8 x i8> %b to <8 x i16>
+ %or = or <8 x i16> %z1, %z2
+ ret <8 x i16> %or
+}
+
+define <2 x i64> @xor_zext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @xor_zext_v2i32_to_v2i64(
+; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = zext <2 x i32> [[XOR_INNER]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[XOR]]
+;
+ %z1 = zext <2 x i32> %a to <2 x i64>
+ %z2 = zext <2 x i32> %b to <2 x i64>
+ %xor = xor <2 x i64> %z1, %z2
+ ret <2 x i64> %xor
+}
+
+; Test bitwise operations with sign extend
+define <4 x i32> @and_sext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_sext_v4i16_to_v4i32(
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = sext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %s1 = sext <4 x i16> %a to <4 x i32>
+ %s2 = sext <4 x i16> %b to <4 x i32>
+ %and = and <4 x i32> %s1, %s2
+ ret <4 x i32> %and
+}
+
+define <8 x i16> @or_sext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @or_sext_v8i8_to_v8i16(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = sext <8 x i8> [[OR_INNER]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[OR]]
+;
+ %s1 = sext <8 x i8> %a to <8 x i16>
+ %s2 = sext <8 x i8> %b to <8 x i16>
+ %or = or <8 x i16> %s1, %s2
+ ret <8 x i16> %or
+}
+
+define <2 x i64> @xor_sext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @xor_sext_v2i32_to_v2i64(
+; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = sext <2 x i32> [[XOR_INNER]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[XOR]]
+;
+ %s1 = sext <2 x i32> %a to <2 x i64>
+ %s2 = sext <2 x i32> %b to <2 x i64>
+ %xor = xor <2 x i64> %s1, %s2
+ ret <2 x i64> %xor
+}
+
+; Negative test: mismatched cast types (zext and sext)
+define <4 x i32> @and_zext_sext_mismatch(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_sext_mismatch(
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[S2:%.*]] = sext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[Z1]], [[S2]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %s2 = sext <4 x i16> %b to <4 x i32>
+ %and = and <4 x i32> %z1, %s2
+ ret <4 x i32> %and
+}
+
+; Negative test: mismatched source types
+define <4 x i32> @or_zext_different_src_types(<4 x i16> %a, <4 x i8> %b) {
+; CHECK-LABEL: @or_zext_different_src_types(
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[Z2:%.*]] = zext <4 x i8> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[Z1]], [[Z2]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %z2 = zext <4 x i8> %b to <4 x i32>
+ %or = or <4 x i32> %z1, %z2
+ ret <4 x i32> %or
+}
+
+; Negative test: scalar types (not vectors)
+define i32 @xor_zext_scalar(i16 %a, i16 %b) {
+; CHECK-LABEL: @xor_zext_scalar(
+; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT: [[Z2:%.*]] = zext i16 [[B:%.*]] to i32
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Z1]], [[Z2]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %z1 = zext i16 %a to i32
+ %z2 = zext i16 %b to i32
+ %xor = xor i32 %z1, %z2
+ ret i32 %xor
+}
+
+; Test multi-use: one cast has multiple uses
+define <4 x i32> @and_zext_multiuse(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_multiuse(
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[Z1]], [[AND]]
+; CHECK-NEXT: ret <4 x i32> [[ADD]]
+;
+ %z1 = zext <4 x i16> %a to <4 x i32>
+ %z2 = zext <4 x i16> %b to <4 x i32>
+ %and = and <4 x i32> %z1, %z2
+ %add = add <4 x i32> %z1, %and ; z1 has multiple uses
+ ret <4 x i32> %add
+}
+
+; Test with different vector sizes
+define <16 x i16> @or_zext_v16i8_to_v16i16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @or_zext_v16i8_to_v16i16(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <16 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = zext <16 x i8> [[OR_INNER]] to <16 x i16>
+; CHECK-NEXT: ret <16 x i16> [[OR]]
+;
+ %z1 = zext <16 x i8> %a to <16 x i16>
+ %z2 = zext <16 x i8> %b to <16 x i16>
+ %or = or <16 x i16> %z1, %z2
+ ret <16 x i16> %or
+}
+
+; Test bitcast with different element counts
+define <8 x i16> @xor_bitcast_v4i32_to_v8i16(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @xor_bitcast_v4i32_to_v8i16(
+; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = bitcast <4 x i32> [[XOR_INNER]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[XOR]]
+;
+ %bc1 = bitcast <4 x i32> %a to <8 x i16>
+ %bc2 = bitcast <4 x i32> %b to <8 x i16>
+ %xor = xor <8 x i16> %bc1, %bc2
+ ret <8 x i16> %xor
+}
+
+; Test truncate with flag preservation
+define <4 x i16> @and_trunc_nuw_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @and_trunc_nuw_nsw(
+; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = trunc nuw nsw <4 x i32> [[AND_INNER]] to <4 x i16>
+; CHECK-NEXT: ret <4 x i16> [[AND]]
+;
+ %t1 = trunc nuw nsw <4 x i32> %a to <4 x i16>
+ %t2 = trunc nuw nsw <4 x i32> %b to <4 x i16>
+ %and = and <4 x i16> %t1, %t2
+ ret <4 x i16> %and
+}
+
+; Test zero extend with nneg flag
+define <4 x i32> @or_zext_nneg(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @or_zext_nneg(
+; CHECK-NEXT: [[OR_INNER:%.*]] = or <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = zext nneg <4 x i16> [[OR_INNER]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %z1 = zext nneg <4 x i16> %a to <4 x i32>
+ %z2 = zext nneg <4 x i16> %b to <4 x i32>
+ %or = or <4 x i32> %z1, %z2
+ ret <4 x i32> %or
+}
diff --git a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
new file mode 100644
index 0000000..af0d7f1
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
@@ -0,0 +1,165 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+; --------------------------------------------------------------------
+; Wrong mangled types
+; --------------------------------------------------------------------
+
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <16 x i64> %A
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <16 x i64> %B
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; --------------------------------------------------------------------
+; Impossible vector types
+; --------------------------------------------------------------------
+
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <9 x i32> %A
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v9i32_fp8___v16i32_fp8(<9 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v9i32.v16i32(i32 0, <9 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <9 x i32> %B
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v9i32_fp8(<16 x i32> %A, <9 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v9i32(i32 0, <16 x i32> %A, i32 0, <9 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; --------------------------------------------------------------------
+; Out of bounds format
+; --------------------------------------------------------------------
+
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: i32 9999
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 9999, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: i32 9999
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 9999, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too small)
+; --------------------------------------------------------------------
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_fp8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %A
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 0, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v8i32_bf8___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 1, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %A
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_bf8___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 1, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 0, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %B
+; CHECK-NEXT: i32 0
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 0, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf8(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 1, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
+; CHECK-NEXT: <12 x i32> %B
+; CHECK-NEXT: i32 1
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_bf8(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 1, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/Verifier/amdgpu-cc.ll b/llvm/test/Verifier/amdgpu-cc.ll
index aec0977..e86825e 100644
--- a/llvm/test/Verifier/amdgpu-cc.ll
+++ b/llvm/test/Verifier/amdgpu-cc.ll
@@ -217,3 +217,36 @@ define amdgpu_cs_chain_preserve void @preallocated_cc_amdgpu_cs_chain_preserve(p
define amdgpu_cs_chain_preserve void @inalloca_cc_amdgpu_cs_chain_preserve(ptr inalloca(i32) %ptr) {
ret void
}
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_no_args
+define amdgpu_gfx_whole_wave void @whole_wave_no_args() {
+ ret void
+}
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_must_have_i1_active
+define amdgpu_gfx_whole_wave void @whole_wave_must_have_i1_active(i32 %x) {
+ ret void
+}
+
+; CHECK: Calling convention requires first argument to not be inreg
+; CHECK-NEXT: ptr @whole_wave_i1_active_inreg
+define amdgpu_gfx_whole_wave void @whole_wave_i1_active_inreg(i1 inreg %active) {
+ ret void
+}
+
+; CHECK: Calling convention does not support varargs
+; CHECK-NEXT: ptr @whole_wave_varargs
+define amdgpu_gfx_whole_wave void @whole_wave_varargs(i1 %active, i32 %x, ...) {
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave void @whole_wave_callee(i1 %active)
+
+; CHECK: calling convention does not permit calls
+; CHECK-NEXT: call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+define amdgpu_cs void @cant_call_whole_wave_func() {
+ call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+ ret void
+}
diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll
index dd940d5..c1bb932 100644
--- a/llvm/test/Verifier/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/intrinsic-immarg.ll
@@ -164,19 +164,21 @@ define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask, i3
}
declare void @llvm.lifetime.start.p0(i64, ptr)
-define void @test_lifetime_start(i64 %arg0, ptr %ptr) {
+define void @test_lifetime_start(i64 %arg0) {
; CHECK: immarg operand has non-immediate parameter
; CHECK-NEXT: i64 %arg0
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 %arg0, ptr %ptr)
+ %ptr = alloca i64
call void @llvm.lifetime.start.p0(i64 %arg0, ptr %ptr)
ret void
}
declare void @llvm.lifetime.end.p0(i64, ptr)
-define void @test_lifetime_end(i64 %arg0, ptr %ptr) {
+define void @test_lifetime_end(i64 %arg0) {
; CHECK: immarg operand has non-immediate parameter
; CHECK-NEXT: i64 %arg0
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 %arg0, ptr %ptr)
+ %ptr = alloca i64
call void @llvm.lifetime.end.p0(i64 %arg0, ptr %ptr)
ret void
}
diff --git a/llvm/test/Verifier/opaque-ptr.ll b/llvm/test/Verifier/opaque-ptr.ll
index 1f29000..10e43a4 100644
--- a/llvm/test/Verifier/opaque-ptr.ll
+++ b/llvm/test/Verifier/opaque-ptr.ll
@@ -37,12 +37,14 @@ define void @atomicrmw(ptr %a, i32 %i) {
ret void
}
-define void @opaque_mangle(ptr %a) {
+define void @opaque_mangle() {
; CHECK-LABEL: @opaque_mangle(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[A:%.*]])
+; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[A]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[A]])
; CHECK-NEXT: ret void
;
+ %a = alloca i64
call void @llvm.lifetime.start.p0(i64 8, ptr %a)
call void @llvm.lifetime.end.p0(i64 8, ptr %a)
ret void
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips64_eh.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips64_eh.ll.expected
index 897209a..56058bb 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips64_eh.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/mips64_eh.ll.expected
@@ -8,17 +8,17 @@ define i32 @main() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset 31, -8
-; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .Ltmp0: # EH_LABEL
; CHECK-NEXT: jal foo
; CHECK-NEXT: nop
-; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .Ltmp1: # EH_LABEL
; CHECK-NEXT: # %bb.1: # %good
; CHECK-NEXT: addiu $2, $zero, 5
; CHECK-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
; CHECK-NEXT: jr $ra
; CHECK-NEXT: daddiu $sp, $sp, 16
; CHECK-NEXT: .LBB0_2: # %bad
-; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .Ltmp2: # EH_LABEL
; CHECK-NEXT: jal _Unwind_Resume
; CHECK-NEXT: nop
%1 = invoke i32 @foo() to label %good unwind label %bad
diff --git a/llvm/test/tools/llvm-exegesis/RISCV/set-reg-init-check.s b/llvm/test/tools/llvm-exegesis/RISCV/set-reg-init-check.s
new file mode 100644
index 0000000..153e86a
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/RISCV/set-reg-init-check.s
@@ -0,0 +1,7 @@
+# RUN: llvm-exegesis -mode=latency -mtriple=riscv32-unknown-linux-gnu --mcpu=generic --dump-object-to-disk=%d --benchmark-phase=assemble-measured-code --opcode-name=FADD_D -mattr="+d" 2>&1
+# RUN: llvm-objdump -M numeric -d %d > %t.s
+# RUN: FileCheck %s < %t.s
+
+CHECK: <foo>:
+CHECK: li x30, 0x0
+CHECK-NEXT: fcvt.d.w f{{[0-9]|[12][0-9]|3[01]}}, x30
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s
index c7755dc..5cf5ed5 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s
@@ -2322,685 +2322,685 @@ vwsub.wx v8, v16, x30
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VI vadd.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VI vadd.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VV vadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VV vadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADD_VX vadd.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADD_VX vadd.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VV vsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VV vsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSUB_VX vsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSUB_VX vsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VVM vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VVM vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VXM vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VXM vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VADC_VIM vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VADC_VIM vadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VVM vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VVM vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSBC_VXM vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSBC_VXM vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VV vwaddu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_VV vwaddu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_VX vwaddu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_VX vwaddu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VV vwadd.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_VV vwadd.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_VX vwadd.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_VX vwadd.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VV vwsubu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_VV vwsubu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_VX vwsubu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_VX vwsubu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VV vwsub.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_VV vwsub.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_VX vwsub.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_VX vwsub.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAADDU_VV vaaddu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
@@ -3354,533 +3354,533 @@ vwsub.wx v8, v16, x30
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VASUB_VX vasub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VI vmadc.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VI vmadc.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VIM vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VIM vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VV vmadc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VV vmadc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VVM vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VVM vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VX vmadc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VX vmadc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADC_VXM vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMADC_VXM vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VV vmsbc.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VV vmsbc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VVM vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VX vmsbc.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VX vmsbc.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSBC_VXM vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VI vrsub.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VI vrsub.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRSUB_VX vrsub.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VRSUB_VX vrsub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSADDU_VI vsaddu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
@@ -4322,245 +4322,245 @@ vwsub.wx v8, v16, x30
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSSUB_VX vssub.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WV vwaddu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_WV vwaddu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADDU_WX vwaddu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADDU_WX vwaddu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WV vwadd.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_WV vwadd.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWADD_WX vwadd.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWADD_WX vwadd.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WV vwsubu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_WV vwsubu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUBU_WX vwsubu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUBU_WX vwsubu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WV vwsub.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_WV vwsub.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWSUB_WX vwsub.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWSUB_WX vwsub.wx v8, v16, t5
# CHECK: Resources:
# CHECK-NEXT: [0] - SMX60_FP
@@ -4574,690 +4574,690 @@ vwsub.wx v8, v16, x30
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 1120.00 - - - - 1120.00 -
+# CHECK-NEXT: - 1120.00 - - - - 3292.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadd.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vadd.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vaaddu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -5611,533 +5611,533 @@ vwsub.wx v8, v16, x30
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vasub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmadc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsbc.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmsbc.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrsub.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrsub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vsaddu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -6579,242 +6579,242 @@ vwsub.wx v8, v16, x30
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vssub.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwaddu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwaddu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwadd.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwadd.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsubu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsubu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwsub.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwsub.wx v8, v16, t5
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s
index 0b5dd60..89d3872 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s
@@ -1478,1157 +1478,1157 @@ vssrl.vx v8, v8, x30
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VV vand.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VV vand.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VX vand.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VX vand.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VAND_VI vand.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VAND_VI vand.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VV vor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VV vor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VX vor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VX vor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VOR_VI vor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VOR_VI vor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VV vxor.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VV vxor.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VX vxor.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VX vxor.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VXOR_VI vxor.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VXOR_VI vxor.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WV vnsra.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WV vnsra.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WX vnsra.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WX vnsra.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRA_WI vnsra.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRA_WI vnsra.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WV vnsrl.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WV vnsrl.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WX vnsrl.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WX vnsrl.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNSRL_WI vnsrl.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNSRL_WI vnsrl.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WI vnclipu.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WI vnclipu.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WV vnclipu.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WV vnclipu.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIPU_WX vnclipu.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIPU_WX vnclipu.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WI vnclip.wi v8, v16, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WI vnclip.wi v8, v16, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WV vnclip.wv v8, v16, v24
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WV vnclip.wv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNCLIP_WX vnclip.wx v8, v16, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VNCLIP_WX vnclip.wx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VI vsll.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VI vsll.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VV vsll.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VV vsll.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLL_VX vsll.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSLL_VX vsll.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VI vsra.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VI vsra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VV vsra.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VV vsra.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRA_VX vsra.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRA_VX vsra.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VI vsrl.vi v8, v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VI vsrl.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VV vsrl.vv v8, v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VV vsrl.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSRL_VX vsrl.vx v8, v8, t5
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSRL_VX vsrl.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSSRA_VI vssra.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
@@ -2906,1162 +2906,1162 @@ vssrl.vx v8, v8, x30
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 708.00 - - - - 708.00 -
+# CHECK-NEXT: - 708.00 - - - - 2436.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vand.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vand.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vand.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vand.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vxor.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vxor.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsra.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsra.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnsrl.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnsrl.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclipu.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclipu.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnclip.wx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vnclip.wx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsll.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsll.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsra.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsra.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsrl.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vsrl.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vssra.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s
index e381b45..f0247e4 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s
@@ -926,885 +926,885 @@ vmslt.vx v8, v8, x30
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VV vmseq.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VV vmseq.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VX vmseq.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VX vmseq.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSEQ_VI vmseq.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSEQ_VI vmseq.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VV vmsle.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VV vmsle.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VX vmsle.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VX vmsle.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLE_VI vmsle.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLE_VI vmsle.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VV vmsleu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VV vmsleu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VX vmsleu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VX vmsleu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLEU_VI vmsleu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLEU_VI vmsleu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VV vmsne.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VV vmsne.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VX vmsne.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VX vmsne.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSNE_VI vmsne.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSNE_VI vmsne.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VI vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VI vmsgtu.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGTU_VX vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGTU_VX vmsgtu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VI vmsgt.vi v8, v8, 12
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VI vmsgt.vi v8, v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSGT_VX vmsgt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSGT_VX vmsgt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VV vmsltu.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VV vmsltu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLTU_VX vmsltu.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLTU_VX vmsltu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VV vmslt.vv v8, v8, v8
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VV vmslt.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 6 4.00 6 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 10 4.00 10 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMSLT_VX vmslt.vx v8, v8, t5
+# CHECK-NEXT: 1 18 4.00 18 SMX60_VIEU[4] VMSLT_VX vmslt.vx v8, v8, t5
# CHECK: Resources:
# CHECK-NEXT: [0] - SMX60_FP
@@ -1818,887 +1818,887 @@ vmslt.vx v8, v8, x30
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 440.00 - - - - 440.00 -
+# CHECK-NEXT: - 440.00 - - - - 1760.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmseq.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmseq.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsle.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsle.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsleu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsleu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsne.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsne.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgtu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgtu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vi v8, v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vi v8, v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsgt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsgt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmsltu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmsltu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmslt.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmslt.vx v8, v8, t5
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s
index ca6e9d1..9592d1b 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s
@@ -615,117 +615,117 @@ vfwcvt.xu.f.v v8, v16
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF2 vsext.vf2 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSEXT_VF2 vsext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF2 vzext.vf2 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VZEXT_VF2 vzext.vf2 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF4 vsext.vf4 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSEXT_VF4 vsext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF4 vzext.vf4 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VZEXT_VF4 vzext.vf4 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF8 vsext.vf8 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF8 vsext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF8 vsext.vf8 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSEXT_VF8 vsext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF8 vsext.vf8 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VSEXT_VF8 vsext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSEXT_VF8 vsext.vf8 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VSEXT_VF8 vsext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF8 vzext.vf8 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF8 vzext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF8 vzext.vf8 v8, v16
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VZEXT_VF8 vzext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF8 vzext.vf8 v8, v16
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VZEXT_VF8 vzext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VZEXT_VF8 vzext.vf8 v8, v16
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VZEXT_VF8 vzext.vf8 v8, v16
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFCVT_F_XU_V vfcvt.f.xu.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
@@ -1189,122 +1189,122 @@ vfwcvt.xu.f.v v8, v16
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 281.00 - - - 225.00 56.00 -
+# CHECK-NEXT: - 281.00 - - - 225.00 224.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf2 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf2 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf4 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf4 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vsext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vsext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vzext.vf8 v8, v16
+# CHECK-NEXT: - - - - - - 4.00 - vzext.vf8 v8, v16
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfcvt.f.xu.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s
index a3105c3..d8e0feb 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s
@@ -755,567 +755,567 @@ vfwnmsac.vv v8, v16, v24
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VV vmacc.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VV vmacc.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMACC_VX vmacc.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMACC_VX vmacc.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VV vmadd.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VV vmadd.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMADD_VX vmadd.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMADD_VX vmadd.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VV vnmsac.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VV vnmsac.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSAC_VX vnmsac.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSAC_VX vnmsac.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VV vnmsub.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VV vnmsub.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VNMSUB_VX vnmsub.vx v8, s0, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VNMSUB_VX vnmsub.vx v8, s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VV vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCU_VV vwmaccu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCU_VX vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCU_VX vwmaccu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VV vwmacc.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACC_VV vwmacc.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACC_VX vwmacc.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACC_VX vwmacc.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VV vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCSU_VV vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCSU_VX vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCSU_VX vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMACCUS_VX vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMACCUS_VX vwmaccus.vx v8, a6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWMACC_VF vfwmacc.vf v8, fa6, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
@@ -1473,572 +1473,572 @@ vfwnmsac.vv v8, v16, v24
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 353.00 - - - 72.00 281.00 -
+# CHECK-NEXT: - 353.00 - - - 72.00 1652.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmacc.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmacc.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmadd.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmadd.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsac.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsac.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vnmsub.vx v8, s0, v8
+# CHECK-NEXT: - - - - - - 7.00 - vnmsub.vx v8, s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmacc.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmacc.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccsu.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccsu.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmaccus.vx v8, a6, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmaccus.vx v8, a6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfwmacc.vf v8, fa6, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s
index 4cc496b..2b6f4ba 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s
@@ -386,357 +386,357 @@ vminu.vx v8, v8, x30
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VV vmax.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VV vmax.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAX_VX vmax.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAX_VX vmax.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VV vmaxu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VV vmaxu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMAXU_VX vmaxu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMAXU_VX vmaxu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VV vmin.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VV vmin.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMIN_VX vmin.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMIN_VX vmin.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VV vminu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VV vminu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMINU_VX vminu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMINU_VX vminu.vx v8, v8, t5
# CHECK: Resources:
# CHECK-NEXT: [0] - SMX60_FP
@@ -750,359 +750,359 @@ vminu.vx v8, v8, x30
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 176.00 - - - - 176.00 -
+# CHECK-NEXT: - 176.00 - - - - 704.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmax.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmax.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmaxu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmaxu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmin.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vmin.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vminu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 4.00 - vminu.vx v8, v8, t5
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s
index 5faf262..572ebf2 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s
@@ -1022,889 +1022,889 @@ vsmul.vx v8, v8, x30
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VV vmul.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VV vmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMUL_VX vmul.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMUL_VX vmul.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VV vdiv.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VV vdiv.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIV_VX vdiv.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIV_VX vdiv.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VV vdivu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VV vdivu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VDIVU_VX vdivu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VDIVU_VX vdivu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VV vrem.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VV vrem.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREM_VX vrem.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREM_VX vrem.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VV vremu.vv v8, v8, v8
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VV vremu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 12 12.00 12 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 48 12.00 48 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 96 12.00 96 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREMU_VX vremu.vx v8, v8, t5
+# CHECK-NEXT: 1 192 12.00 192 SMX60_VIEU[12] VREMU_VX vremu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VV vmulh.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VV vmulh.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULH_VX vmulh.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULH_VX vmulh.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VV vmulhu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VV vmulhu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHU_VX vmulhu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHU_VX vmulhu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VV vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VV vmulhsu.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 7 7.00 7 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 8 7.00 8 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 16 7.00 16 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMULHSU_VX vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: 1 32 7.00 32 SMX60_VIEU[7] VMULHSU_VX vmulhsu.vx v8, v8, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VV vwmul.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMUL_VV vwmul.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMUL_VX vwmul.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMUL_VX vwmul.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VV vwmulu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULU_VV vwmulu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULU_VX vwmulu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULU_VX vwmulu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VV vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULSU_VV vwmulsu.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 5 4.00 5 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWMULSU_VX vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VWMULSU_VX vwmulsu.vx v8, v16, t5
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSMUL_VV vsmul.vv v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
@@ -2006,894 +2006,894 @@ vsmul.vx v8, v8, x30
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 486.00 - - - - 486.00 -
+# CHECK-NEXT: - 486.00 - - - - 3748.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmul.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmul.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdiv.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdiv.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vdivu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vdivu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrem.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vrem.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vremu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 12.00 - vremu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulh.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulh.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vv v8, v8, v8
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmulhsu.vx v8, v8, t5
+# CHECK-NEXT: - - - - - - 7.00 - vmulhsu.vx v8, v8, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmul.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmul.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwmulsu.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vwmulsu.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vsmul.vv v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s
index fa53c08..5ae0d43b 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s
@@ -1198,137 +1198,137 @@ vfslide1up.vf v8, v16, ft0
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_V vmv.v.v v8, v8
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_V vmv.v.v v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_X vmv.v.x v8, s0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_X vmv.v.x v8, s0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_V_I vmv.v.i v8, 12
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
@@ -2120,137 +2120,137 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VIM vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VVM vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VVM vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 8 4.00 8 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMERGE_VXM vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMERGE_VXM vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMERGE_VFM vfmerge.vfm v8, v8, ft0, v0
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
@@ -2354,142 +2354,142 @@ vfslide1up.vf v8, v16, ft0
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 572.00 - - - 45.00 527.00 -
+# CHECK-NEXT: - 572.00 - - - 45.00 923.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.v v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.v.i v8, 12
+# CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3281,137 +3281,137 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vim v8, v8, 12, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vvm v8, v8, v8, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vvm v8, v8, v8, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmerge.vxm v8, v8, t5, v0
+# CHECK-NEXT: - - - - - - 4.00 - vmerge.vxm v8, v8, t5, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfmerge.vfm v8, v8, ft0, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s b/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s
index 69b7489..085f258 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/debug-vars-dwarf4.s
@@ -15,10 +15,10 @@
## Check that passing the default value for --debug-vars-indent (52) makes no
## change to the output.
-# RUN: llvm-objdump %t.o -d --debug-vars --debug-vars-indent=52 | \
+# RUN: llvm-objdump %t.o -d --debug-vars --debug-indent=52 | \
# RUN: FileCheck %s --check-prefix=RAW --strict-whitespace
-# RUN: llvm-objdump %t.o -d --debug-vars --debug-vars-indent=30 | \
+# RUN: llvm-objdump %t.o -d --debug-vars --debug-indent=30 | \
# RUN: FileCheck %s --check-prefix=INDENT --strict-whitespace
# RUN: llvm-objdump %t.o -d --debug-vars --no-show-raw-insn | \
diff --git a/llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc b/llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc
new file mode 100644
index 0000000..a708bc0
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc
@@ -0,0 +1,10 @@
+int bar(int x, int y) {
+ int sum = x + y;
+ int mul = x * y;
+ return sum + mul;
+}
+
+int foo(int a, int b) {
+ int result = bar(a, b);
+ return result;
+}
diff --git a/llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s b/llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s
new file mode 100644
index 0000000..6ed3507
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/X86/debug-inlined-functions.s
@@ -0,0 +1,648 @@
+## Generated with this compile command, with the source code in Inputs/debug-inlined-functions.cc:
+## clang++ -g -c debug-inlined-functions.cc -O1 -S -o -
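+##
+## At -O1, bar is small enough for clang to inline it into foo; the checks
+## below rely on that inlining to exercise the --debug-inlined-funcs output.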
+
+# RUN: llvm-mc -triple=x86_64 %s -filetype=obj -o %t.o
+
+# RUN: llvm-objdump %t.o -d --debug-inlined-funcs=unicode | \
+# RUN: FileCheck %s --check-prefixes=UNICODE,UNICODE-MANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs | \
+# RUN: FileCheck %s --check-prefixes=UNICODE,UNICODE-DEMANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=unicode | \
+# RUN: FileCheck %s --check-prefixes=UNICODE,UNICODE-DEMANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=unicode --debug-indent=30 | \
+# RUN: FileCheck %s --check-prefix=UNICODE-DEMANGLED-INDENT --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=ascii | \
+# RUN: FileCheck %s --check-prefix=ASCII-DEMANGLED --strict-whitespace
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=limits-only | \
+# RUN: FileCheck %s --check-prefix=LIMITS-ONLY-DEMANGLED
+
+# RUN: llvm-objdump %t.o -d -C --debug-inlined-funcs=unicode --debug-vars=unicode | \
+# RUN: FileCheck %s --check-prefix=DEBUG-DEMANGLED-ALL --strict-whitespace
+
+# UNICODE-MANGLED: 0000000000000000 <_Z3barii>:
+# UNICODE-DEMANGLED: 0000000000000000 <bar(int, int)>:
+# UNICODE-NEXT: 0: 8d 04 3e leal (%rsi,%rdi), %eax
+# UNICODE-NEXT: 3: 0f af f7 imull %edi, %esi
+# UNICODE-NEXT: 6: 01 f0 addl %esi, %eax
+# UNICODE-NEXT: 8: c3 retq
+# UNICODE-NEXT: 9: 0f 1f 80 00 00 00 00 nopl (%rax)
+# UNICODE-EMPTY:
+# UNICODE-MANGLED-NEXT: 0000000000000010 <_Z3fooii>:
+# UNICODE-DEMANGLED-NEXT: 0000000000000010 <foo(int, int)>:
+# UNICODE-MANGLED-NEXT: ┠─ _Z3barii = inlined into _Z3fooii
+# UNICODE-DEMANGLED-NEXT: ┠─ bar(int, int) = inlined into foo(int, int)
+# UNICODE-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax ┃
+# UNICODE-NEXT: 13: 0f af f7 imull %edi, %esi ┃
+# UNICODE-NEXT: 16: 01 f0 addl %esi, %eax ┻
+# UNICODE-NEXT: 18: c3 retq
+
+# UNICODE-DEMANGLED-INDENT: 0000000000000010 <foo(int, int)>:
+# UNICODE-DEMANGLED-INDENT-NEXT: ┠─ bar(int, int) = inlined into foo(int, int)
+# UNICODE-DEMANGLED-INDENT-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax ┃
+# UNICODE-DEMANGLED-INDENT-NEXT: 13: 0f af f7 imull %edi, %esi ┃
+# UNICODE-DEMANGLED-INDENT-NEXT: 16: 01 f0 addl %esi, %eax ┻
+# UNICODE-DEMANGLED-INDENT-NEXT: 18: c3 retq
+
+# ASCII-DEMANGLED: 0000000000000010 <foo(int, int)>:
+# ASCII-DEMANGLED-NEXT: |- bar(int, int) = inlined into foo(int, int)
+# ASCII-DEMANGLED-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax |
+# ASCII-DEMANGLED-NEXT: 13: 0f af f7 imull %edi, %esi |
+# ASCII-DEMANGLED-NEXT: 16: 01 f0 addl %esi, %eax v
+# ASCII-DEMANGLED-NEXT: 18: c3 retq
+
+# LIMITS-ONLY-DEMANGLED: 0000000000000010 <foo(int, int)>:
+# LIMITS-ONLY-DEMANGLED-NEXT: debug-inlined-functions.cc:8:16: bar(int, int) inlined into foo(int, int)
+# LIMITS-ONLY-DEMANGLED-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax
+# LIMITS-ONLY-DEMANGLED-NEXT: 13: 0f af f7 imull %edi, %esi
+# LIMITS-ONLY-DEMANGLED-NEXT: 16: 01 f0 addl %esi, %eax
+# LIMITS-ONLY-DEMANGLED-NEXT: debug-inlined-functions.cc:8:16: end of bar(int, int) inlined into foo(int, int)
+# LIMITS-ONLY-DEMANGLED-NEXT: 18: c3 retq
+
+# DEBUG-DEMANGLED-ALL: 0000000000000010 <foo(int, int)>:
+# DEBUG-DEMANGLED-ALL-NEXT: ┠─ a = RDI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┠─ b = RSI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┠─ bar(int, int) = inlined into foo(int, int)
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┠─ x = RDI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┠─ y = RSI
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┃ ┌─ sum = RAX
+# DEBUG-DEMANGLED-ALL-NEXT: 10: 8d 04 3e leal (%rsi,%rdi), %eax ┃ ┃ ┃ ┃ ┃ ╈
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┃ ┃ ┌─ b = entry(RSI)
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┃ ┃ ┃ ┃ ┃ │ ┌─ mul = RSI
+# DEBUG-DEMANGLED-ALL-NEXT: 13: 0f af f7 imull %edi, %esi ┃ ┻ ┃ ┃ ┻ ┃ ╈ ╈
+# DEBUG-DEMANGLED-ALL-NEXT: ┃ ┌─ result = RAX
+# DEBUG-DEMANGLED-ALL-NEXT: 16: 01 f0 addl %esi, %eax ┃ ╈ ┻ ┻ ┻ ┃ ┃
+# DEBUG-DEMANGLED-ALL-NEXT: 18: c3 retq ┻ ┻ ┻ ┻
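+## (In the unicode output above, ╈ marks where a variable's location becomes
+## live, ┃ continues the range, and ┻ ends it.)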
+
+ .file "debug-inlined-functions.cc"
+ .text
+ .globl _Z3barii # -- Begin function _Z3barii
+ .p2align 4
+ .type _Z3barii,@function
+_Z3barii: # @_Z3barii
+.Lfunc_begin0:
+ .file 0 "debug-inlined-functions.cc" md5 0xf07b869ec4d0996589aa6856ae4e6c83
+ .cfi_startproc
+# %bb.0: # %entry
+ #DEBUG_VALUE: bar:x <- $edi
+ #DEBUG_VALUE: bar:y <- $esi
+ # kill: def $esi killed $esi def $rsi
+ # kill: def $edi killed $edi def $rdi
+ .loc 0 2 15 prologue_end # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:2:15
+ leal (%rsi,%rdi), %eax
+.Ltmp0:
+ #DEBUG_VALUE: bar:sum <- $eax
+ .loc 0 3 15 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:3:15
+ imull %edi, %esi
+.Ltmp1:
+ #DEBUG_VALUE: bar:y <- [DW_OP_LLVM_entry_value 1] $esi
+ #DEBUG_VALUE: bar:mul <- $esi
+ .loc 0 4 14 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:4:14
+ addl %esi, %eax
+.Ltmp2:
+ .loc 0 4 3 is_stmt 0 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:4:3
+ retq
+.Ltmp3:
+.Lfunc_end0:
+ .size _Z3barii, .Lfunc_end0-_Z3barii
+ .cfi_endproc
+ # -- End function
+ .globl _Z3fooii # -- Begin function _Z3fooii
+ .p2align 4
+ .type _Z3fooii,@function
+_Z3fooii: # @_Z3fooii
+.Lfunc_begin1:
+ .cfi_startproc
+# %bb.0: # %entry
+ #DEBUG_VALUE: foo:a <- $edi
+ #DEBUG_VALUE: foo:b <- $esi
+ #DEBUG_VALUE: bar:x <- $edi
+ #DEBUG_VALUE: bar:y <- $esi
+ # kill: def $esi killed $esi def $rsi
+ # kill: def $edi killed $edi def $rdi
+ .loc 0 2 15 prologue_end is_stmt 1 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:2:15 @[ llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:8:16 ]
+ leal (%rsi,%rdi), %eax
+.Ltmp4:
+ #DEBUG_VALUE: bar:sum <- $eax
+ .loc 0 3 15 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:3:15 @[ llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:8:16 ]
+ imull %edi, %esi
+.Ltmp5:
+ #DEBUG_VALUE: foo:b <- [DW_OP_LLVM_entry_value 1] $esi
+ #DEBUG_VALUE: bar:mul <- $esi
+ .loc 0 4 14 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:4:14 @[ llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:8:16 ]
+ addl %esi, %eax
+.Ltmp6:
+ #DEBUG_VALUE: foo:result <- $eax
+ .loc 0 9 3 # llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc:9:3
+ retq
+.Ltmp7:
+.Lfunc_end1:
+ .size _Z3fooii, .Lfunc_end1-_Z3fooii
+ .cfi_endproc
+ # -- End function
+ .section .debug_loclists,"",@progbits
+ .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+ .short 5 # Version
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+ .long 8 # Offset entry count
+.Lloclists_table_base0:
+ .long .Ldebug_loc0-.Lloclists_table_base0
+ .long .Ldebug_loc1-.Lloclists_table_base0
+ .long .Ldebug_loc2-.Lloclists_table_base0
+ .long .Ldebug_loc3-.Lloclists_table_base0
+ .long .Ldebug_loc4-.Lloclists_table_base0
+ .long .Ldebug_loc5-.Lloclists_table_base0
+ .long .Ldebug_loc6-.Lloclists_table_base0
+ .long .Ldebug_loc7-.Lloclists_table_base0
+.Ldebug_loc0:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Lfunc_begin0-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp1-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp1-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # ending offset
+ .byte 4 # Loc expr size
+ .byte 163 # DW_OP_entry_value
+ .byte 1 # 1
+ .byte 84 # super-register DW_OP_reg4
+ .byte 159 # DW_OP_stack_value
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc1:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp0-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp2-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 80 # super-register DW_OP_reg0
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc2:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp1-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc3:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Lfunc_begin1-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp5-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp5-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end1-.Lfunc_begin0 # ending offset
+ .byte 4 # Loc expr size
+ .byte 163 # DW_OP_entry_value
+ .byte 1 # 1
+ .byte 84 # super-register DW_OP_reg4
+ .byte 159 # DW_OP_stack_value
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc4:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Lfunc_begin1-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp5-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc5:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp4-.Lfunc_begin0 # starting offset
+ .uleb128 .Ltmp6-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 80 # super-register DW_OP_reg0
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc6:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp5-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end1-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 84 # super-register DW_OP_reg4
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_loc7:
+ .byte 4 # DW_LLE_offset_pair
+ .uleb128 .Ltmp6-.Lfunc_begin0 # starting offset
+ .uleb128 .Lfunc_end1-.Lfunc_begin0 # ending offset
+ .byte 1 # Loc expr size
+ .byte 80 # super-register DW_OP_reg0
+ .byte 0 # DW_LLE_end_of_list
+.Ldebug_list_header_end0:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .ascii "\214\001" # DW_AT_loclists_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 122 # DW_AT_call_all_calls
+ .byte 25 # DW_FORM_flag_present
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 32 # DW_AT_inline
+ .byte 33 # DW_FORM_implicit_const
+ .byte 1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 122 # DW_AT_call_all_calls
+ .byte 25 # DW_FORM_flag_present
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 12 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 13 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 34 # DW_FORM_loclistx
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 14 # Abbreviation Code
+ .byte 29 # DW_TAG_inlined_subroutine
+ .byte 1 # DW_CHILDREN_yes
+ .byte 49 # DW_AT_abstract_origin
+ .byte 19 # DW_FORM_ref4
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 88 # DW_AT_call_file
+ .byte 11 # DW_FORM_data1
+ .byte 89 # DW_AT_call_line
+ .byte 11 # DW_FORM_data1
+ .byte 87 # DW_AT_call_column
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 1 # Abbrev [1] 0xc:0xc4 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .long .Lloclists_table_base0 # DW_AT_loclists_base
+ .byte 2 # Abbrev [2] 0x27:0x26 DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 87
+ # DW_AT_call_all_calls
+ .long 77 # DW_AT_abstract_origin
+ .byte 3 # Abbrev [3] 0x33:0x7 DW_TAG_formal_parameter
+ .byte 1 # DW_AT_location
+ .byte 85
+ .long 86 # DW_AT_abstract_origin
+ .byte 4 # Abbrev [4] 0x3a:0x6 DW_TAG_formal_parameter
+ .byte 0 # DW_AT_location
+ .long 94 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0x40:0x6 DW_TAG_variable
+ .byte 1 # DW_AT_location
+ .long 102 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0x46:0x6 DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .long 110 # DW_AT_abstract_origin
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0x4d:0x2a DW_TAG_subprogram
+ .byte 3 # DW_AT_linkage_name
+ .byte 4 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ # DW_AT_external
+ # DW_AT_inline
+ .byte 7 # Abbrev [7] 0x56:0x8 DW_TAG_formal_parameter
+ .byte 6 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 7 # Abbrev [7] 0x5e:0x8 DW_TAG_formal_parameter
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 8 # Abbrev [8] 0x66:0x8 DW_TAG_variable
+ .byte 8 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 8 # Abbrev [8] 0x6e:0x8 DW_TAG_variable
+ .byte 9 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 9 # Abbrev [9] 0x77:0x4 DW_TAG_base_type
+ .byte 5 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 10 # Abbrev [10] 0x7b:0x54 DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 87
+ # DW_AT_call_all_calls
+ .byte 10 # DW_AT_linkage_name
+ .byte 11 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ # DW_AT_external
+ .byte 11 # Abbrev [11] 0x8b:0xa DW_TAG_formal_parameter
+ .byte 1 # DW_AT_location
+ .byte 85
+ .byte 12 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 12 # Abbrev [12] 0x95:0x9 DW_TAG_formal_parameter
+ .byte 3 # DW_AT_location
+ .byte 13 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 13 # Abbrev [13] 0x9e:0x9 DW_TAG_variable
+ .byte 7 # DW_AT_location
+ .byte 14 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 14 # Abbrev [14] 0xa7:0x27 DW_TAG_inlined_subroutine
+ .long 77 # DW_AT_abstract_origin
+ .byte 1 # DW_AT_low_pc
+ .long .Ltmp6-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 0 # DW_AT_call_file
+ .byte 8 # DW_AT_call_line
+ .byte 16 # DW_AT_call_column
+ .byte 3 # Abbrev [3] 0xb4:0x7 DW_TAG_formal_parameter
+ .byte 1 # DW_AT_location
+ .byte 85
+ .long 86 # DW_AT_abstract_origin
+ .byte 4 # Abbrev [4] 0xbb:0x6 DW_TAG_formal_parameter
+ .byte 4 # DW_AT_location
+ .long 94 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0xc1:0x6 DW_TAG_variable
+ .byte 5 # DW_AT_location
+ .long 102 # DW_AT_abstract_origin
+ .byte 5 # Abbrev [5] 0xc7:0x6 DW_TAG_variable
+ .byte 6 # DW_AT_location
+ .long 110 # DW_AT_abstract_origin
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 64 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 21.0.0git (git@github.com:llvm/llvm-project.git eed98e1493414ae9c30596b1eeb8f4a9b260e42a)" # string offset=0
+.Linfo_string1:
+ .asciz "llvm/test/tools/llvm-objdump/X86/Inputs/debug-inlined-functions.cc" # string offset=112
+.Linfo_string2:
+ .asciz "llvm-project" # string offset=179
+.Linfo_string3:
+ .asciz "_Z3barii" # string offset=229
+.Linfo_string4:
+ .asciz "bar" # string offset=238
+.Linfo_string5:
+ .asciz "int" # string offset=242
+.Linfo_string6:
+ .asciz "x" # string offset=246
+.Linfo_string7:
+ .asciz "y" # string offset=248
+.Linfo_string8:
+ .asciz "sum" # string offset=250
+.Linfo_string9:
+ .asciz "mul" # string offset=254
+.Linfo_string10:
+ .asciz "_Z3fooii" # string offset=258
+.Linfo_string11:
+ .asciz "foo" # string offset=267
+.Linfo_string12:
+ .asciz "a" # string offset=271
+.Linfo_string13:
+ .asciz "b" # string offset=273
+.Linfo_string14:
+ .asciz "result" # string offset=275
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string8
+ .long .Linfo_string9
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .long .Linfo_string12
+ .long .Linfo_string13
+ .long .Linfo_string14
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .quad .Lfunc_begin1
+.Ldebug_addr_end0:
+ .ident "clang version 21.0.0git (git@github.com:llvm/llvm-project.git eed98e1493414ae9c30596b1eeb8f4a9b260e42a)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-readobj/ELF/sframe-header.test b/llvm/test/tools/llvm-readobj/ELF/sframe-header.test
new file mode 100644
index 0000000..f827296
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/sframe-header.test
@@ -0,0 +1,150 @@
+## Check parsing and dumping of the SFrame header.
+# RUN: yaml2obj --docnum=1 %s -o %t.1
+# RUN: llvm-readobj --sframe=.sframe_bad_sh_size --sframe=.sframe_1b \
+# RUN: --sframe=.sframe_bad_magic --sframe=.sframe_bad_version \
+# RUN: --sframe=.sframe_6b --sframe=.sframe_header %t.1 2>&1 | \
+# RUN: FileCheck %s --strict-whitespace --match-full-lines \
+# RUN: -DFILE=%t.1 --check-prefix=CASE1
+
+## Check big-endian support and the handling of --sframe argument default.
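+## The preamble bytes below are byte-swapped relative to the first document
+## (0xde, 0xe2 rather than 0xe2, 0xde) because this file is ELFDATA2MSB.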
+# RUN: yaml2obj --docnum=2 %s -o %t.2
+# RUN: llvm-readobj --sframe %t.2 2>&1 | \
+# RUN: FileCheck %s --strict-whitespace --match-full-lines \
+# RUN: -DFILE=%t.2 --check-prefix=CASE2
+
+## Check handling of corrupted ELF files (bad sh_name).
+# RUN: yaml2obj --docnum=3 %s -o %t.3
+# RUN: not llvm-readobj --sframe %t.3 2>&1 | \
+# RUN: FileCheck %s --strict-whitespace --match-full-lines \
+# RUN: -DFILE=%t.3 --check-prefix=CASE3
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+Sections:
+ - Name: .sframe_bad_sh_size
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ShSize: 0xfffff
+# CASE1-LABEL:SFrame section '.sframe_bad_sh_size' {
+# CASE1:{{.*}}: warning: '[[FILE]]': The end of the file was unexpectedly encountered
+ - Name: .sframe_1b
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ContentArray: [ 0x00 ]
+# CASE1-LABEL:SFrame section '.sframe_1b' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: unexpected end of data at offset 0x1 while reading [0x0, 0x4)
+
+ - Name: .sframe_bad_magic
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ContentArray: [ 0xde, 0xad, 0xbe, 0xef]
+# CASE1-LABEL:SFrame section '.sframe_bad_magic' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: invalid magic number (0xadde)
+
+ - Name: .sframe_bad_version
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ContentArray: [
+ 0xe2, 0xde, 0x01, 0x00 # Preamble (magic, version, flags)
+ ]
+# CASE1-LABEL:SFrame section '.sframe_bad_version' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: invalid/unsupported version number (1)
+
+ - Name: .sframe_6b
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ContentArray: [
+ 0xe2, 0xde, 0x02, 0x00, # Preamble (magic, version, flags)
+ 0x01, 0x02
+ ]
+
+# CASE1-LABEL:SFrame section '.sframe_6b' {
+# CASE1:{{.*}}: warning: '[[FILE]]': invalid sframe section: unexpected end of data at offset 0x6 while reading [0x0, 0x1c)
+
+ - Name: .sframe_header
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ContentArray: [
+ 0xe2, 0xde, 0x02, 0x06, # Preamble (magic, version, flags)
+ # Header:
+ 0x03, 0x42, 0x47, 0x00, # ABI, Fixed FP offset, Fixed RA Offset, AUX header length
+ 0x01, 0x00, 0x00, 0x00, # Number of FDEs
+ 0x10, 0x00, 0x00, 0x00, # Number of FREs
+ 0x00, 0x10, 0x00, 0x00, # FRE length
+ 0x04, 0x00, 0x00, 0x00, # FDE offset
+ 0x00, 0x01, 0x00, 0x00, # FRE offset
+ ]
+# CASE1-LABEL:SFrame section '.sframe_header' {
+# CASE1: Header {
+# CASE1-NEXT: Magic: 0xDEE2
+# CASE1-NEXT: Version: V2 (0x2)
+# CASE1-NEXT: Flags [ (0x6)
+# CASE1-NEXT: FDEFuncStartPCRel (0x4){{ *}}
+# CASE1-NEXT: FramePointer (0x2){{ *}}
+# CASE1-NEXT: ]
+# CASE1-NEXT: ABI: AMD64EndianLittle (0x3)
+# CASE1-NEXT: CFA fixed FP offset (unused): 66
+# CASE1-NEXT: CFA fixed RA offset: 71
+# CASE1-NEXT: Auxiliary header length: 0
+# CASE1-NEXT: Num FDEs: 1
+# CASE1-NEXT: Num FREs: 16
+# CASE1-NEXT: FRE subsection length: 4096
+# CASE1-NEXT: FDE subsection offset: 4
+# CASE1-NEXT: FRE subsection offset: 256
+# CASE1-NEXT: }
+# CASE1-NEXT:}
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2MSB
+ Type: ET_EXEC
+Sections:
+ - Name: .sframe
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ContentArray: [
+ 0xde, 0xe2, 0x02, 0x01, # Preamble (magic, version, flags)
+ # Header:
+ 0x01, 0x42, 0x47, 0x00, # ABI, fixed FP offset, fixed RA offset, aux header length
+ 0x00, 0x00, 0x00, 0x01, # Number of FDEs
+ 0x00, 0x00, 0x00, 0x10, # Number of FREs
+ 0x00, 0x00, 0x10, 0x00, # FRE length
+ 0x00, 0x00, 0x00, 0x04, # FDE offset
+ 0x00, 0x00, 0x01, 0x00, # FRE offset
+ ]
+# CASE2-LABEL:SFrame section '.sframe' {
+# CASE2: Header {
+# CASE2-NEXT: Magic: 0xDEE2
+# CASE2-NEXT: Version: V2 (0x2)
+# CASE2-NEXT: Flags [ (0x1)
+# CASE2-NEXT: FDESorted (0x1){{ *}}
+# CASE2-NEXT: ]
+# CASE2-NEXT: ABI: AArch64EndianBig (0x1)
+# CASE2-NEXT: CFA fixed FP offset (unused): 66
+# CASE2-NEXT: CFA fixed RA offset (unused): 71
+# CASE2-NEXT: Auxiliary header length: 0
+# CASE2-NEXT: Num FDEs: 1
+# CASE2-NEXT: Num FREs: 16
+# CASE2-NEXT: FRE subsection length: 4096
+# CASE2-NEXT: FDE subsection offset: 4
+# CASE2-NEXT: FRE subsection offset: 256
+# CASE2-NEXT: }
+# CASE2-NEXT:}
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2MSB
+ Type: ET_EXEC
+Sections:
+ - Name: .corrupted
+ Type: SHT_GNU_SFRAME
+ Flags: [ SHF_ALLOC ]
+ ShName: 0x10000
+# CASE3:{{.*}}: error: '[[FILE]]': a section [index 1] has an invalid sh_name (0x10000) offset which goes past the end of the section name string table
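For reference, the 28-byte preamble-plus-header exercised by the '.sframe_header' document above decodes as sketched below. This is a hedged, standalone illustration, not LLVM's SFrameParser: the field names follow the ContentArray comments in the test, only the little-endian case of the first document is handled, and truncation is checked in one step rather than per-subrange.

    // Standalone sketch of the SFrame v2 header layout used in the test above.
    #include <cstddef>
    #include <cstdint>

    static uint16_t readLE16(const uint8_t *P) {
      return uint16_t(P[0]) | uint16_t(P[1]) << 8;
    }
    static uint32_t readLE32(const uint8_t *P) {
      return uint32_t(P[0]) | uint32_t(P[1]) << 8 | uint32_t(P[2]) << 16 |
             uint32_t(P[3]) << 24;
    }

    struct SFrameHeaderV2 {
      uint16_t Magic;          // 0xDEE2
      uint8_t Version;         // 2; the '.sframe_bad_version' doc stores 1
      uint8_t Flags;           // 0x6 = FDEFuncStartPCRel | FramePointer
      uint8_t ABIArch;         // 3 = AMD64EndianLittle
      int8_t CFAFixedFPOffset; // 0x42 = 66
      int8_t CFAFixedRAOffset; // 0x47 = 71
      uint8_t AuxHdrLen;
      uint32_t NumFDEs, NumFREs, FRELen, FDEOff, FREOff;
    };

    // Returns false for the truncated ('.sframe_1b', '.sframe_6b'), bad-magic,
    // and bad-version inputs that the CASE1 warnings above diagnose. Note that
    // 28 == 0x1c, matching the "[0x0, 0x1c)" range in the '.sframe_6b' warning.
    static bool parseSFrameHeader(const uint8_t *Data, size_t Size,
                                  SFrameHeaderV2 &H) {
      if (Size < 28)
        return false;
      H.Magic = readLE16(Data);
      if (H.Magic != 0xdee2)
        return false;
      H.Version = Data[2];
      if (H.Version != 2)
        return false;
      H.Flags = Data[3];
      H.ABIArch = Data[4];
      H.CFAFixedFPOffset = int8_t(Data[5]);
      H.CFAFixedRAOffset = int8_t(Data[6]);
      H.AuxHdrLen = Data[7];
      H.NumFDEs = readLE32(Data + 8);
      H.NumFREs = readLE32(Data + 12);
      H.FRELen = readLE32(Data + 16);
      H.FDEOff = readLE32(Data + 20);
      H.FREOff = readLE32(Data + 24);
      return true;
    }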
diff --git a/llvm/tools/bugpoint/bugpoint.cpp b/llvm/tools/bugpoint/bugpoint.cpp
index e49efdf..87581e80a 100644
--- a/llvm/tools/bugpoint/bugpoint.cpp
+++ b/llvm/tools/bugpoint/bugpoint.cpp
@@ -22,6 +22,7 @@
#include "llvm/LinkAllIR.h"
#include "llvm/LinkAllPasses.h"
#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/AlwaysTrue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/PluginLoader.h"
@@ -111,7 +112,7 @@ int main(int argc, char **argv) {
initializeInstCombine(Registry);
initializeTarget(Registry);
- if (std::getenv("bar") == (char*) -1) {
+ if (!llvm::getNonFoldableAlwaysTrue()) {
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
index 676479b..ea830bd 100644
--- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
@@ -651,8 +651,10 @@ static std::vector<MCInst> loadFP64RegBits32(const MCSubtargetInfo &STI,
}
std::vector<MCInst> Instrs = loadIntReg(STI, ScratchIntReg, Bits);
- Instrs.push_back(
- MCInstBuilder(RISCV::FCVT_D_W).addReg(Reg).addReg(ScratchIntReg));
+ Instrs.push_back(MCInstBuilder(RISCV::FCVT_D_W)
+ .addReg(Reg)
+ .addReg(ScratchIntReg)
+ .addImm(RISCVFPRndMode::RNE));
return Instrs;
}
diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td
index c3764c6..c97e06f 100644
--- a/llvm/tools/llvm-objdump/ObjdumpOpts.td
+++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td
@@ -241,17 +241,23 @@ defm prefix_strip
"paths. No effect without --prefix">,
MetaVarName<"prefix">;
+def debug_indent_EQ : Joined<["--"], "debug-indent=">,
+ HelpText<"Distance to indent the source-level variable and inlined function display, "
+ "relative to the start of the disassembly">;
+
+def debug_inlined_funcs_EQ : Joined<["--"], "debug-inlined-funcs=">,
+ HelpText<"Print the locations of inlined functions alongside disassembly. "
+ "Supported formats: ascii, limits-only, and unicode (default)">,
+ Values<"ascii,limits-only,unicode">;
+def : Flag<["--"], "debug-inlined-funcs">, Alias<debug_inlined_funcs_EQ>, AliasArgs<["unicode"]>;
+
def debug_vars_EQ : Joined<["--"], "debug-vars=">,
HelpText<"Print the locations (in registers or memory) of "
"source-level variables alongside disassembly. "
"Supported formats: ascii, unicode (default)">,
- Values<"unicode,ascii">;
+ Values<"ascii,unicode">;
def : Flag<["--"], "debug-vars">, Alias<debug_vars_EQ>, AliasArgs<["unicode"]>;
-def debug_vars_indent_EQ : Joined<["--"], "debug-vars-indent=">,
- HelpText<"Distance to indent the source-level variable display, "
- "relative to the start of the disassembly">;
-
def x86_asm_syntax_att : Flag<["--"], "x86-asm-syntax=att">,
HelpText<"Emit AT&T-style disassembly">;
diff --git a/llvm/tools/llvm-objdump/SourcePrinter.cpp b/llvm/tools/llvm-objdump/SourcePrinter.cpp
index 3630502..b0ff89d 100644
--- a/llvm/tools/llvm-objdump/SourcePrinter.cpp
+++ b/llvm/tools/llvm-objdump/SourcePrinter.cpp
@@ -6,9 +6,9 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the LiveVariablePrinter and SourcePrinter classes to
+// This file implements the LiveElementPrinter and SourcePrinter classes to
// keep track of DWARF info as the current address is updated, and print out the
-// source file line and variable liveness as needed.
+// source file line and variable or inlined function liveness as needed.
//
//===----------------------------------------------------------------------===//
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/DebugInfo/DWARF/DWARFExpressionPrinter.h"
#include "llvm/DebugInfo/DWARF/LowLevel/DWARFExpression.h"
+#include "llvm/Demangle/Demangle.h"
#include "llvm/Support/FormatVariadic.h"
#define DEBUG_TYPE "objdump"
@@ -24,7 +25,70 @@
namespace llvm {
namespace objdump {
-bool LiveVariable::liveAtAddress(object::SectionedAddress Addr) {
+bool InlinedFunction::liveAtAddress(object::SectionedAddress Addr) const {
+ if (!Range.valid())
+ return false;
+
+ return Range.LowPC <= Addr.Address && Range.HighPC > Addr.Address;
+}
+
+void InlinedFunction::print(raw_ostream &OS, const MCRegisterInfo &MRI) const {
+ const char *MangledCallerName = FuncDie.getName(DINameKind::LinkageName);
+ if (!MangledCallerName)
+ return;
+
+ if (Demangle)
+ OS << "inlined into " << demangle(MangledCallerName);
+ else
+ OS << "inlined into " << MangledCallerName;
+}
+
+void InlinedFunction::dump(raw_ostream &OS) const {
+ OS << Name << " @ " << Range << ": ";
+}
+
+void InlinedFunction::printElementLine(raw_ostream &OS,
+ object::SectionedAddress Addr,
+ bool IsEnd) const {
+ bool LiveIn = !IsEnd && Range.LowPC == Addr.Address;
+ bool LiveOut = IsEnd && Range.HighPC == Addr.Address;
+ if (!(LiveIn || LiveOut))
+ return;
+
+ uint32_t CallFile, CallLine, CallColumn, CallDiscriminator;
+ InlinedFuncDie.getCallerFrame(CallFile, CallLine, CallColumn,
+ CallDiscriminator);
+ const DWARFDebugLine::LineTable *LineTable =
+ Unit->getContext().getLineTableForUnit(Unit);
+ std::string FileName;
+ if (!LineTable || !LineTable->hasFileAtIndex(CallFile))
+ return;
+ if (!LineTable->getFileNameByIndex(
+ CallFile, Unit->getCompilationDir(),
+ DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FileName))
+ return;
+
+ if (FileName.empty())
+ return;
+
+ const char *MangledCallerName = FuncDie.getName(DINameKind::LinkageName);
+ if (!MangledCallerName)
+ return;
+
+ std::string CallerName = MangledCallerName;
+ std::string CalleeName = Name;
+ if (Demangle) {
+ CallerName = demangle(MangledCallerName);
+ CalleeName = demangle(Name);
+ }
+
+ OS << "; " << FileName << ":" << CallLine << ":" << CallColumn << ": ";
+ if (IsEnd)
+ OS << "end of ";
+ OS << CalleeName << " inlined into " << CallerName << "\n";
+}
+
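With --debug-inlined-funcs=limits-only, printElementLine above emits one comment line at the first and last address of each inlined range instead of drawing live-range columns. A hypothetical rendering (the file path, call-site coordinates, and symbol names are invented for illustration; with --demangle the mangled names would print as bar() and foo()):

    ; /src/demo.c:12:3: _Z3barv inlined into _Z3foov
      ...disassembly of the inlined body...
    ; /src/demo.c:12:3: end of _Z3barv inlined into _Z3foov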
+bool LiveVariable::liveAtAddress(object::SectionedAddress Addr) const {
if (LocExpr.Range == std::nullopt)
return false;
return LocExpr.Range->SectionIndex == Addr.SectionIndex &&
@@ -49,7 +113,24 @@ void LiveVariable::print(raw_ostream &OS, const MCRegisterInfo &MRI) const {
printDwarfExpressionCompact(&Expression, OS, GetRegName);
}
-void LiveVariablePrinter::addVariable(DWARFDie FuncDie, DWARFDie VarDie) {
+void LiveVariable::dump(raw_ostream &OS) const {
+ OS << Name << " @ " << LocExpr.Range << ": ";
+}
+
+void LiveElementPrinter::addInlinedFunction(DWARFDie FuncDie,
+ DWARFDie InlinedFuncDie) {
+ uint64_t FuncLowPC, FuncHighPC, SectionIndex;
+ if (!InlinedFuncDie.getLowAndHighPC(FuncLowPC, FuncHighPC, SectionIndex))
+ return;
+
+ DWARFUnit *U = InlinedFuncDie.getDwarfUnit();
+ const char *InlinedFuncName = InlinedFuncDie.getName(DINameKind::LinkageName);
+ if (!InlinedFuncName)
+ return;
+ DWARFAddressRange Range{FuncLowPC, FuncHighPC, SectionIndex};
+ LiveElements.emplace_back(std::make_unique<InlinedFunction>(
+ InlinedFuncName, U, FuncDie, InlinedFuncDie, Range));
+}
+
+void LiveElementPrinter::addVariable(DWARFDie FuncDie, DWARFDie VarDie) {
uint64_t FuncLowPC, FuncHighPC, SectionIndex;
FuncDie.getLowAndHighPC(FuncLowPC, FuncHighPC, SectionIndex);
const char *VarName = VarDie.getName(DINameKind::ShortName);
@@ -67,7 +148,8 @@ void LiveVariablePrinter::addVariable(DWARFDie FuncDie, DWARFDie VarDie) {
for (const DWARFLocationExpression &LocExpr : *Locs) {
if (LocExpr.Range) {
- LiveVariables.emplace_back(LocExpr, VarName, U, FuncDie);
+ LiveElements.emplace_back(
+ std::make_unique<LiveVariable>(LocExpr, VarName, U, FuncDie));
} else {
// If the LocExpr does not have an associated range, it is valid for
// the whole of the function.
@@ -75,24 +157,30 @@ void LiveVariablePrinter::addVariable(DWARFDie FuncDie, DWARFDie VarDie) {
// LocExpr, does that happen in reality?
DWARFLocationExpression WholeFuncExpr{
DWARFAddressRange(FuncLowPC, FuncHighPC, SectionIndex), LocExpr.Expr};
- LiveVariables.emplace_back(WholeFuncExpr, VarName, U, FuncDie);
+ LiveElements.emplace_back(
+ std::make_unique<LiveVariable>(WholeFuncExpr, VarName, U, FuncDie));
}
}
}
-void LiveVariablePrinter::addFunction(DWARFDie D) {
+void LiveElementPrinter::addFunction(DWARFDie D) {
for (const DWARFDie &Child : D.children()) {
- if (Child.getTag() == dwarf::DW_TAG_variable ||
- Child.getTag() == dwarf::DW_TAG_formal_parameter)
+ if (DbgVariables != DFDisabled &&
+ (Child.getTag() == dwarf::DW_TAG_variable ||
+ Child.getTag() == dwarf::DW_TAG_formal_parameter)) {
addVariable(D, Child);
- else
+ } else if (DbgInlinedFunctions != DFDisabled &&
+ Child.getTag() == dwarf::DW_TAG_inlined_subroutine) {
+ addInlinedFunction(D, Child);
+ addFunction(Child);
+ } else
addFunction(Child);
}
}
-// Get the column number (in characters) at which the first live variable
+// Get the column number (in characters) at which the first live element
// line should be printed.
-unsigned LiveVariablePrinter::getIndentLevel() const {
+unsigned LiveElementPrinter::getIndentLevel() const {
return DbgIndent + getInstStartColumn(STI);
}
@@ -100,8 +188,8 @@ unsigned LiveVariablePrinter::getIndentLevel() const {
// printed line, and return the index of that column.
// TODO: formatted_raw_ostream uses "column" to mean a number of characters
// since the last \n, and we use it to mean the number of slots in which we
-// put live variable lines. Pick a less overloaded word.
-unsigned LiveVariablePrinter::moveToFirstVarColumn(formatted_raw_ostream &OS) {
+// put live element lines. Pick a less overloaded word.
+unsigned LiveElementPrinter::moveToFirstVarColumn(formatted_raw_ostream &OS) {
// Logical column number: column zero is the first column we print in, each
// logical column is 2 physical columns wide.
unsigned FirstUnprintedLogicalColumn =
@@ -117,7 +205,7 @@ unsigned LiveVariablePrinter::moveToFirstVarColumn(formatted_raw_ostream &OS) {
return FirstUnprintedLogicalColumn;
}
-unsigned LiveVariablePrinter::findFreeColumn() {
+unsigned LiveElementPrinter::findFreeColumn() {
for (unsigned ColIdx = 0; ColIdx < ActiveCols.size(); ++ColIdx)
if (!ActiveCols[ColIdx].isActive())
return ColIdx;
@@ -127,15 +215,15 @@ unsigned LiveVariablePrinter::findFreeColumn() {
return OldSize;
}
-void LiveVariablePrinter::dump() const {
- for (const LiveVariable &LV : LiveVariables) {
- dbgs() << LV.VarName << " @ " << LV.LocExpr.Range << ": ";
- LV.print(dbgs(), MRI);
+void LiveElementPrinter::dump() const {
+ for (const std::unique_ptr<LiveElement> &LE : LiveElements) {
+ LE->dump(dbgs());
+ LE->print(dbgs(), MRI);
dbgs() << "\n";
}
}
-void LiveVariablePrinter::addCompileUnit(DWARFDie D) {
+void LiveElementPrinter::addCompileUnit(DWARFDie D) {
if (D.getTag() == dwarf::DW_TAG_subprogram)
addFunction(D);
else
@@ -148,47 +236,57 @@ void LiveVariablePrinter::addCompileUnit(DWARFDie D) {
/// live-in to the instruction, and any live range active at NextAddr is
/// live-out of the instruction. If IncludeDefinedVars is false, then live
/// ranges starting at NextAddr will be ignored.
-void LiveVariablePrinter::update(object::SectionedAddress ThisAddr,
- object::SectionedAddress NextAddr,
- bool IncludeDefinedVars) {
+void LiveElementPrinter::update(object::SectionedAddress ThisAddr,
+ object::SectionedAddress NextAddr,
+ bool IncludeDefinedVars) {
+ // Do not track live ranges when --debug-inlined-funcs is set to the
+ // limits-only line format.
+ if (DbgInlinedFunctions == DFLimitsOnly)
+ return;
+
// First, check variables which have already been assigned a column, so
// that we don't change their order.
- SmallSet<unsigned, 8> CheckedVarIdxs;
+ SmallSet<unsigned, 8> CheckedElementIdxs;
for (unsigned ColIdx = 0, End = ActiveCols.size(); ColIdx < End; ++ColIdx) {
if (!ActiveCols[ColIdx].isActive())
continue;
- CheckedVarIdxs.insert(ActiveCols[ColIdx].VarIdx);
- LiveVariable &LV = LiveVariables[ActiveCols[ColIdx].VarIdx];
- ActiveCols[ColIdx].LiveIn = LV.liveAtAddress(ThisAddr);
- ActiveCols[ColIdx].LiveOut = LV.liveAtAddress(NextAddr);
+
+ CheckedElementIdxs.insert(ActiveCols[ColIdx].ElementIdx);
+ const std::unique_ptr<LiveElement> &LE =
+ LiveElements[ActiveCols[ColIdx].ElementIdx];
+ ActiveCols[ColIdx].LiveIn = LE->liveAtAddress(ThisAddr);
+ ActiveCols[ColIdx].LiveOut = LE->liveAtAddress(NextAddr);
+ std::string Name = Demangle ? demangle(LE->getName()) : LE->getName();
LLVM_DEBUG(dbgs() << "pass 1, " << ThisAddr.Address << "-"
- << NextAddr.Address << ", " << LV.VarName << ", Col "
- << ColIdx << ": LiveIn=" << ActiveCols[ColIdx].LiveIn
+ << NextAddr.Address << ", " << Name << ", Col " << ColIdx
+ << ": LiveIn=" << ActiveCols[ColIdx].LiveIn
<< ", LiveOut=" << ActiveCols[ColIdx].LiveOut << "\n");
if (!ActiveCols[ColIdx].LiveIn && !ActiveCols[ColIdx].LiveOut)
- ActiveCols[ColIdx].VarIdx = Column::NullVarIdx;
+ ActiveCols[ColIdx].ElementIdx = Column::NullElementIdx;
}
// Next, look for variables which don't already have a column, but which
// are now live.
if (IncludeDefinedVars) {
- for (unsigned VarIdx = 0, End = LiveVariables.size(); VarIdx < End;
- ++VarIdx) {
- if (CheckedVarIdxs.count(VarIdx))
+ for (unsigned ElementIdx = 0, End = LiveElements.size(); ElementIdx < End;
+ ++ElementIdx) {
+ if (CheckedElementIdxs.count(ElementIdx))
continue;
- LiveVariable &LV = LiveVariables[VarIdx];
- bool LiveIn = LV.liveAtAddress(ThisAddr);
- bool LiveOut = LV.liveAtAddress(NextAddr);
+
+ const std::unique_ptr<LiveElement> &LE = LiveElements[ElementIdx];
+ bool LiveIn = LE->liveAtAddress(ThisAddr);
+ bool LiveOut = LE->liveAtAddress(NextAddr);
if (!LiveIn && !LiveOut)
continue;
unsigned ColIdx = findFreeColumn();
+ std::string Name = Demangle ? demangle(LE->getName()) : LE->getName();
LLVM_DEBUG(dbgs() << "pass 2, " << ThisAddr.Address << "-"
- << NextAddr.Address << ", " << LV.VarName << ", Col "
+ << NextAddr.Address << ", " << Name << ", Col "
<< ColIdx << ": LiveIn=" << LiveIn
<< ", LiveOut=" << LiveOut << "\n");
- ActiveCols[ColIdx].VarIdx = VarIdx;
+ ActiveCols[ColIdx].ElementIdx = ElementIdx;
ActiveCols[ColIdx].LiveIn = LiveIn;
ActiveCols[ColIdx].LiveOut = LiveOut;
ActiveCols[ColIdx].MustDrawLabel = true;
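The two passes above keep column assignment stable: pass 1 re-evaluates elements that already own a column (freeing the column once the element is neither live-in nor live-out), and pass 2 hands the first free column to any element that has just become live. A simplified standalone sketch of that policy, not the LLVM code itself; liveAtAddress is reduced to a callback and addresses to plain integers:

    #include <cstdint>
    #include <limits>
    #include <set>
    #include <vector>

    struct Col {
      static constexpr unsigned None = std::numeric_limits<unsigned>::max();
      unsigned ElementIdx = None;
      bool LiveIn = false, LiveOut = false;
      bool active() const { return ElementIdx != None; }
    };

    // LiveAt stands in for LiveElement::liveAtAddress.
    void update(std::vector<Col> &Cols, unsigned NumElements,
                bool (*LiveAt)(unsigned Elem, uint64_t Addr), uint64_t This,
                uint64_t Next) {
      std::set<unsigned> Checked;
      for (Col &C : Cols) { // pass 1: keep existing columns stable
        if (!C.active())
          continue;
        Checked.insert(C.ElementIdx);
        C.LiveIn = LiveAt(C.ElementIdx, This);
        C.LiveOut = LiveAt(C.ElementIdx, Next);
        if (!C.LiveIn && !C.LiveOut)
          C = Col(); // free the column for reuse
      }
      for (unsigned E = 0; E != NumElements; ++E) { // pass 2: newly live
        if (Checked.count(E))
          continue;
        bool In = LiveAt(E, This), Out = LiveAt(E, Next);
        if (!In && !Out)
          continue;
        Col *Free = nullptr; // reuse the first inactive column, else grow
        for (Col &C : Cols)
          if (!C.active()) {
            Free = &C;
            break;
          }
        if (!Free) {
          Cols.emplace_back();
          Free = &Cols.back();
        }
        Free->ElementIdx = E;
        Free->LiveIn = In;
        Free->LiveOut = Out;
      }
    }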
@@ -205,8 +303,8 @@ enum class LineChar {
LabelCornerActive,
LabelHoriz,
};
-const char *LiveVariablePrinter::getLineChar(LineChar C) const {
- bool IsASCII = DbgVariables == DVASCII;
+const char *LiveElementPrinter::getLineChar(LineChar C) const {
+ bool IsASCII = DbgVariables == DFASCII || DbgInlinedFunctions == DFASCII;
switch (C) {
case LineChar::RangeStart:
return IsASCII ? "^" : (const char *)u8"\u2548";
@@ -231,8 +329,8 @@ const char *LiveVariablePrinter::getLineChar(LineChar C) const {
/// we only need to print active ranges or empty columns. If AfterInst is
/// true, this is being printed after the last instruction fed to update(),
/// otherwise this is being printed before it.
-void LiveVariablePrinter::printAfterOtherLine(formatted_raw_ostream &OS,
- bool AfterInst) {
+void LiveElementPrinter::printAfterOtherLine(formatted_raw_ostream &OS,
+ bool AfterInst) {
if (ActiveCols.size()) {
unsigned FirstUnprintedColumn = moveToFirstVarColumn(OS);
for (size_t ColIdx = FirstUnprintedColumn, End = ActiveCols.size();
@@ -252,15 +350,15 @@ void LiveVariablePrinter::printAfterOtherLine(formatted_raw_ostream &OS,
OS << "\n";
}
-/// Print any live variable range info needed to the right of a
-/// non-instruction line of disassembly. This is where we print the variable
+/// Print any live element range info needed to the right of a
+/// non-instruction line of disassembly. This is where we print the element
/// names and expressions, with thin line-drawing characters connecting them
/// to the live range which starts at the next instruction. If MustPrint is
/// true, we have to print at least one line (with the continuation of any
/// already-active live ranges) because something has already been printed
/// earlier on this line.
-void LiveVariablePrinter::printBetweenInsts(formatted_raw_ostream &OS,
- bool MustPrint) {
+void LiveElementPrinter::printBetweenInsts(formatted_raw_ostream &OS,
+ bool MustPrint) {
bool PrintedSomething = false;
for (unsigned ColIdx = 0, End = ActiveCols.size(); ColIdx < End; ++ColIdx) {
if (ActiveCols[ColIdx].isActive() && ActiveCols[ColIdx].MustDrawLabel) {
@@ -277,17 +375,20 @@ void LiveVariablePrinter::printBetweenInsts(formatted_raw_ostream &OS,
OS << " ";
}
+ const std::unique_ptr<LiveElement> &LE =
+ LiveElements[ActiveCols[ColIdx].ElementIdx];
// Then print the variable name and location of the new live range,
// with box drawing characters joining it to the live range line.
OS << getLineChar(ActiveCols[ColIdx].LiveIn ? LineChar::LabelCornerActive
: LineChar::LabelCornerNew)
<< getLineChar(LineChar::LabelHoriz) << " ";
- WithColor(OS, raw_ostream::GREEN)
- << LiveVariables[ActiveCols[ColIdx].VarIdx].VarName;
+
+ std::string Name = Demangle ? demangle(LE->getName()) : LE->getName();
+ WithColor(OS, raw_ostream::GREEN) << Name;
OS << " = ";
{
WithColor ExprColor(OS, raw_ostream::CYAN);
- LiveVariables[ActiveCols[ColIdx].VarIdx].print(OS, MRI);
+ LE->print(OS, MRI);
}
// If there are any columns to the right of the expression we just
@@ -317,8 +418,8 @@ void LiveVariablePrinter::printBetweenInsts(formatted_raw_ostream &OS,
printAfterOtherLine(OS, false);
}
-/// Print the live variable ranges to the right of a disassembled instruction.
-void LiveVariablePrinter::printAfterInst(formatted_raw_ostream &OS) {
+/// Print the live element ranges to the right of a disassembled instruction.
+void LiveElementPrinter::printAfterInst(formatted_raw_ostream &OS) {
if (!ActiveCols.size())
return;
unsigned FirstUnprintedColumn = moveToFirstVarColumn(OS);
@@ -337,6 +438,24 @@ void LiveVariablePrinter::printAfterInst(formatted_raw_ostream &OS) {
}
}
+void LiveElementPrinter::printStartLine(formatted_raw_ostream &OS,
+ object::SectionedAddress Addr) {
+ // Print a line identifying the start of an inlined function, when the
+ // limits-only line format is specified.
+ if (DbgInlinedFunctions == DFLimitsOnly)
+ for (const std::unique_ptr<LiveElement> &LE : LiveElements)
+ LE->printElementLine(OS, Addr, false);
+}
+
+void LiveElementPrinter::printEndLine(formatted_raw_ostream &OS,
+ object::SectionedAddress Addr) {
+ // Print a line identifying the end of an inlined function, when the
+ // limits-only line format is specified.
+ if (DbgInlinedFunctions == DFLimitsOnly)
+ for (const std::unique_ptr<LiveElement> &LE : LiveElements)
+ LE->printElementLine(OS, Addr, true);
+}
+
bool SourcePrinter::cacheSource(const DILineInfo &LineInfo) {
std::unique_ptr<MemoryBuffer> Buffer;
if (LineInfo.Source) {
@@ -371,7 +490,7 @@ bool SourcePrinter::cacheSource(const DILineInfo &LineInfo) {
void SourcePrinter::printSourceLine(formatted_raw_ostream &OS,
object::SectionedAddress Address,
StringRef ObjectFilename,
- LiveVariablePrinter &LVP,
+ LiveElementPrinter &LEP,
StringRef Delimiter) {
if (!Symbolizer)
return;
@@ -419,15 +538,16 @@ void SourcePrinter::printSourceLine(formatted_raw_ostream &OS,
}
if (PrintLines)
- printLines(OS, LineInfo, Delimiter, LVP);
+ printLines(OS, Address, LineInfo, Delimiter, LEP);
if (PrintSource)
- printSources(OS, LineInfo, ObjectFilename, Delimiter, LVP);
+ printSources(OS, LineInfo, ObjectFilename, Delimiter, LEP);
OldLineInfo = LineInfo;
}
void SourcePrinter::printLines(formatted_raw_ostream &OS,
+ object::SectionedAddress Address,
const DILineInfo &LineInfo, StringRef Delimiter,
- LiveVariablePrinter &LVP) {
+ LiveElementPrinter &LEP) {
bool PrintFunctionName = LineInfo.FunctionName != DILineInfo::BadString &&
LineInfo.FunctionName != OldLineInfo.FunctionName;
if (PrintFunctionName) {
@@ -442,7 +562,7 @@ void SourcePrinter::printLines(formatted_raw_ostream &OS,
(OldLineInfo.Line != LineInfo.Line ||
OldLineInfo.FileName != LineInfo.FileName || PrintFunctionName)) {
OS << Delimiter << LineInfo.FileName << ":" << LineInfo.Line;
- LVP.printBetweenInsts(OS, true);
+ LEP.printBetweenInsts(OS, true);
}
}
@@ -477,7 +597,7 @@ StringRef SourcePrinter::getLine(const DILineInfo &LineInfo,
void SourcePrinter::printSources(formatted_raw_ostream &OS,
const DILineInfo &LineInfo,
StringRef ObjectFilename, StringRef Delimiter,
- LiveVariablePrinter &LVP) {
+ LiveElementPrinter &LEP) {
if (LineInfo.FileName == DILineInfo::BadString || LineInfo.Line == 0 ||
(OldLineInfo.Line == LineInfo.Line &&
OldLineInfo.FileName == LineInfo.FileName))
@@ -486,7 +606,7 @@ void SourcePrinter::printSources(formatted_raw_ostream &OS,
StringRef Line = getLine(LineInfo, ObjectFilename);
if (!Line.empty()) {
OS << Delimiter << Line;
- LVP.printBetweenInsts(OS, true);
+ LEP.printBetweenInsts(OS, true);
}
}
diff --git a/llvm/tools/llvm-objdump/SourcePrinter.h b/llvm/tools/llvm-objdump/SourcePrinter.h
index fc67fc6..5c131a0 100644
--- a/llvm/tools/llvm-objdump/SourcePrinter.h
+++ b/llvm/tools/llvm-objdump/SourcePrinter.h
@@ -22,40 +22,83 @@
namespace llvm {
namespace objdump {
+/// Base class for representing the location of a source-level variable or
+/// the address range of an inlined function.
+class LiveElement {
+protected:
+ const char *Name;
+ DWARFUnit *Unit;
+ const DWARFDie FuncDie;
+
+public:
+ LiveElement(const char *Name, DWARFUnit *Unit, const DWARFDie FuncDie)
+ : Name(Name), Unit(Unit), FuncDie(FuncDie) {}
+
+ virtual ~LiveElement() = default;
+ const char *getName() const { return Name; }
+
+ virtual bool liveAtAddress(object::SectionedAddress Addr) const = 0;
+ virtual void print(raw_ostream &OS, const MCRegisterInfo &MRI) const = 0;
+ virtual void dump(raw_ostream &OS) const = 0;
+ virtual void printElementLine(raw_ostream &OS,
+ object::SectionedAddress Address,
+ bool IsEnd) const {}
+};
+
+class InlinedFunction : public LiveElement {
+private:
+ DWARFDie InlinedFuncDie;
+ DWARFAddressRange Range;
+
+public:
+ InlinedFunction(const char *FunctionName, DWARFUnit *Unit,
+ const DWARFDie FuncDie, const DWARFDie InlinedFuncDie,
+ DWARFAddressRange &Range)
+ : LiveElement(FunctionName, Unit, FuncDie),
+ InlinedFuncDie(InlinedFuncDie), Range(Range) {}
+
+ bool liveAtAddress(object::SectionedAddress Addr) const override;
+ void print(raw_ostream &OS, const MCRegisterInfo &MRI) const override;
+ void dump(raw_ostream &OS) const override;
+ void printElementLine(raw_ostream &OS, object::SectionedAddress Address,
+ bool IsEnd) const override;
+};
+
/// Stores a single expression representing the location of a source-level
/// variable, along with the PC range for which that expression is valid.
-struct LiveVariable {
+class LiveVariable : public LiveElement {
+private:
DWARFLocationExpression LocExpr;
- const char *VarName;
- DWARFUnit *Unit;
- const DWARFDie FuncDie;
+public:
LiveVariable(const DWARFLocationExpression &LocExpr, const char *VarName,
DWARFUnit *Unit, const DWARFDie FuncDie)
- : LocExpr(LocExpr), VarName(VarName), Unit(Unit), FuncDie(FuncDie) {}
+ : LiveElement(VarName, Unit, FuncDie), LocExpr(LocExpr) {}
- bool liveAtAddress(object::SectionedAddress Addr);
-
- void print(raw_ostream &OS, const MCRegisterInfo &MRI) const;
+ bool liveAtAddress(object::SectionedAddress Addr) const override;
+ void print(raw_ostream &OS, const MCRegisterInfo &MRI) const override;
+ void dump(raw_ostream &OS) const override;
};
-/// Helper class for printing source variable locations alongside disassembly.
-class LiveVariablePrinter {
- // Information we want to track about one column in which we are printing a
- // variable live range.
+/// Helper class for printing source locations for variables and inlined
+/// subroutines alongside disassembly.
+class LiveElementPrinter {
+ // Information we want to track about one column in which we are printing an
+ // element live range.
struct Column {
- unsigned VarIdx = NullVarIdx;
+ unsigned ElementIdx = NullElementIdx;
bool LiveIn = false;
bool LiveOut = false;
bool MustDrawLabel = false;
- bool isActive() const { return VarIdx != NullVarIdx; }
+ bool isActive() const { return ElementIdx != NullElementIdx; }
- static constexpr unsigned NullVarIdx = std::numeric_limits<unsigned>::max();
+ static constexpr unsigned NullElementIdx =
+ std::numeric_limits<unsigned>::max();
};
- // All live variables we know about in the object/image file.
- std::vector<LiveVariable> LiveVariables;
+ // All live elements we know about in the object/image file.
+ std::vector<std::unique_ptr<LiveElement>> LiveElements;
// The columns we are currently drawing.
IndexedMap<Column> ActiveCols;
@@ -63,11 +106,12 @@ class LiveVariablePrinter {
const MCRegisterInfo &MRI;
const MCSubtargetInfo &STI;
+ void addInlinedFunction(DWARFDie FuncDie, DWARFDie InlinedFuncDie);
void addVariable(DWARFDie FuncDie, DWARFDie VarDie);
void addFunction(DWARFDie D);
- // Get the column number (in characters) at which the first live variable
+ // Get the column number (in characters) at which the first live element
// line should be printed.
unsigned getIndentLevel() const;
@@ -75,13 +119,13 @@ class LiveVariablePrinter {
// printed line, and return the index of that column.
// TODO: formatted_raw_ostream uses "column" to mean a number of characters
// since the last \n, and we use it to mean the number of slots in which we
- // put live variable lines. Pick a less overloaded word.
+ // put live element lines. Pick a less overloaded word.
unsigned moveToFirstVarColumn(formatted_raw_ostream &OS);
unsigned findFreeColumn();
public:
- LiveVariablePrinter(const MCRegisterInfo &MRI, const MCSubtargetInfo &STI)
+ LiveElementPrinter(const MCRegisterInfo &MRI, const MCSubtargetInfo &STI)
: ActiveCols(Column()), MRI(MRI), STI(STI) {}
void dump() const;
@@ -114,7 +158,7 @@ public:
/// otherwise this is being printed before it.
void printAfterOtherLine(formatted_raw_ostream &OS, bool AfterInst);
- /// Print any live variable range info needed to the right of a
+ /// Print any live element range info needed to the right of a
/// non-instruction line of disassembly. This is where we print the variable
/// names and expressions, with thin line-drawing characters connecting them
/// to the live range which starts at the next instruction. If MustPrint is
@@ -123,8 +167,13 @@ public:
/// earlier on this line.
void printBetweenInsts(formatted_raw_ostream &OS, bool MustPrint);
- /// Print the live variable ranges to the right of a disassembled instruction.
+ /// Print the live element ranges to the right of a disassembled instruction.
void printAfterInst(formatted_raw_ostream &OS);
+
+ /// Print a line identifying the start of a live element.
+ void printStartLine(formatted_raw_ostream &OS, object::SectionedAddress Addr);
+ /// Print a line identifying the end of a live element.
+ void printEndLine(formatted_raw_ostream &OS, object::SectionedAddress Addr);
};
class SourcePrinter {
@@ -144,12 +193,13 @@ protected:
private:
bool cacheSource(const DILineInfo &LineInfoFile);
- void printLines(formatted_raw_ostream &OS, const DILineInfo &LineInfo,
- StringRef Delimiter, LiveVariablePrinter &LVP);
+ void printLines(formatted_raw_ostream &OS, object::SectionedAddress Address,
+ const DILineInfo &LineInfo, StringRef Delimiter,
+ LiveElementPrinter &LEP);
void printSources(formatted_raw_ostream &OS, const DILineInfo &LineInfo,
StringRef ObjectFilename, StringRef Delimiter,
- LiveVariablePrinter &LVP);
+ LiveElementPrinter &LEP);
// Returns line source code corresponding to `LineInfo`.
// Returns empty string if source code cannot be found.
@@ -162,7 +212,7 @@ public:
virtual void printSourceLine(formatted_raw_ostream &OS,
object::SectionedAddress Address,
StringRef ObjectFilename,
- LiveVariablePrinter &LVP,
+ LiveElementPrinter &LEP,
StringRef Delimiter = "; ");
};
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 74eb903..0316c4b 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -348,7 +348,8 @@ static bool Wide;
std::string objdump::Prefix;
uint32_t objdump::PrefixStrip;
-DebugVarsFormat objdump::DbgVariables = DVDisabled;
+DebugFormat objdump::DbgVariables = DFDisabled;
+DebugFormat objdump::DbgInlinedFunctions = DFDisabled;
int objdump::DbgIndent = 52;
@@ -523,8 +524,8 @@ static const Target *getTarget(const ObjectFile *Obj) {
// Get the target specific parser.
std::string Error;
- const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
- Error);
+ const Target *TheTarget =
+ TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
if (!TheTarget)
reportError(Obj->getFileName(), "can't find target: " + Error);
@@ -633,7 +634,7 @@ static bool isCSKYElf(const ObjectFile &Obj) {
}
static bool hasMappingSymbols(const ObjectFile &Obj) {
- return isArmElf(Obj) || isAArch64Elf(Obj) || isCSKYElf(Obj) ;
+ return isArmElf(Obj) || isAArch64Elf(Obj) || isCSKYElf(Obj);
}
static void printRelocation(formatted_raw_ostream &OS, StringRef FileName,
@@ -653,7 +654,7 @@ static void printRelocation(formatted_raw_ostream &OS, StringRef FileName,
static void printBTFRelocation(formatted_raw_ostream &FOS, llvm::BTFParser &BTF,
object::SectionedAddress Address,
- LiveVariablePrinter &LVP) {
+ LiveElementPrinter &LEP) {
const llvm::BTF::BPFFieldReloc *Reloc = BTF.findFieldReloc(Address);
if (!Reloc)
return;
@@ -664,7 +665,7 @@ static void printBTFRelocation(formatted_raw_ostream &FOS, llvm::BTFParser &BTF,
if (LeadingAddr)
FOS << format("%016" PRIx64 ": ", Address.Address + AdjustVMA);
FOS << "CO-RE " << Val;
- LVP.printAfterOtherLine(FOS, true);
+ LEP.printAfterOtherLine(FOS, true);
}
class PrettyPrinter {
@@ -675,10 +676,11 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) {
+ LiveElementPrinter &LEP) {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP);
- LVP.printBetweenInsts(OS, false);
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP);
+ LEP.printStartLine(OS, Address);
+ LEP.printBetweenInsts(OS, false);
printRawData(Bytes, Address.Address, OS, STI);
@@ -698,7 +700,7 @@ public:
const MCAsmInfo &MAI,
const MCSubtargetInfo &STI,
StringRef Comments,
- LiveVariablePrinter &LVP) {
+ LiveElementPrinter &LEP) {
do {
if (!Comments.empty()) {
// Emit a line of comments.
@@ -712,7 +714,7 @@ public:
FOS.PadToColumn(CommentColumn);
FOS << MAI.getCommentString() << ' ' << Comment;
}
- LVP.printAfterInst(FOS);
+ LEP.printAfterInst(FOS);
FOS << "\n";
} while (!Comments.empty());
FOS.flush();
@@ -757,10 +759,10 @@ public:
void emitPostInstructionInfo(formatted_raw_ostream &FOS, const MCAsmInfo &MAI,
const MCSubtargetInfo &STI, StringRef Comments,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
// Hexagon does not write anything to the comment stream, so we can just
// print the separator.
- LVP.printAfterInst(FOS);
+ LEP.printAfterInst(FOS);
FOS << getInstructionSeparator();
FOS.flush();
if (ShouldClosePacket)
@@ -771,9 +773,9 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP, "");
if (!MI) {
printLead(Bytes, Address.Address, OS);
OS << " <unknown>";
@@ -784,7 +786,7 @@ public:
StringRef Preamble = IsStartOfBundle ? " { " : " ";
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP, "");
printLead(Bytes, Address.Address, OS);
OS << Preamble;
std::string Buf;
@@ -845,9 +847,9 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP);
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP);
if (MI) {
SmallString<40> InstStr;
@@ -866,10 +868,10 @@ public:
support::endian::read32<llvm::endianness::little>(Bytes.data()));
OS.indent(42);
} else {
- OS << format("\t.byte 0x%02" PRIx8, Bytes[0]);
- for (unsigned int i = 1; i < Bytes.size(); i++)
- OS << format(", 0x%02" PRIx8, Bytes[i]);
- OS.indent(55 - (6 * Bytes.size()));
+ OS << format("\t.byte 0x%02" PRIx8, Bytes[0]);
+ for (unsigned int i = 1; i < Bytes.size(); i++)
+ OS << format(", 0x%02" PRIx8, Bytes[i]);
+ OS.indent(55 - (6 * Bytes.size()));
}
}
@@ -880,7 +882,7 @@ public:
for (uint32_t D :
ArrayRef(reinterpret_cast<const support::little32_t *>(Bytes.data()),
Bytes.size() / 4))
- OS << format(" %08" PRIX32, D);
+ OS << format(" %08" PRIX32, D);
} else {
for (unsigned char B : Bytes)
OS << format(" %02" PRIX8, B);
@@ -898,9 +900,9 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP);
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP);
if (LeadingAddr)
OS << format("%8" PRId64 ":", Address.Address / 8);
if (ShowRawInsn) {
@@ -921,10 +923,11 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP);
- LVP.printBetweenInsts(OS, false);
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP);
+ LEP.printStartLine(OS, Address);
+ LEP.printBetweenInsts(OS, false);
size_t Start = OS.tell();
if (LeadingAddr)
@@ -975,10 +978,11 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP);
- LVP.printBetweenInsts(OS, false);
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP);
+ LEP.printStartLine(OS, Address);
+ LEP.printBetweenInsts(OS, false);
size_t Start = OS.tell();
if (LeadingAddr)
@@ -1013,10 +1017,11 @@ public:
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
- LiveVariablePrinter &LVP) override {
+ LiveElementPrinter &LEP) override {
if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP);
- LVP.printBetweenInsts(OS, false);
+ SP->printSourceLine(OS, Address, ObjectFilename, LEP);
+ LEP.printStartLine(OS, Address);
+ LEP.printBetweenInsts(OS, false);
size_t Start = OS.tell();
if (LeadingAddr)
@@ -1057,7 +1062,7 @@ public:
RISCVPrettyPrinter RISCVPrettyPrinterInst;
PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
- switch(Triple.getArch()) {
+ switch (Triple.getArch()) {
default:
return PrettyPrinterInst;
case Triple::hexagon:
@@ -1108,8 +1113,7 @@ private:
DisassemblerTarget::DisassemblerTarget(const Target *TheTarget, ObjectFile &Obj,
StringRef TripleName, StringRef MCPU,
SubtargetFeatures &Features)
- : TheTarget(TheTarget),
- Printer(&selectPrettyPrinter(Triple(TripleName))),
+ : TheTarget(TheTarget), Printer(&selectPrettyPrinter(Triple(TripleName))),
RegisterInfo(TheTarget->createMCRegInfo(TripleName)) {
if (!RegisterInfo)
reportError(Obj.getFileName(), "no register info for target " + TripleName);
@@ -1388,7 +1392,6 @@ static bool shouldAdjustVA(const SectionRef &Section) {
return false;
}
-
typedef std::pair<uint64_t, char> MappingSymbolPair;
static char getMappingSymbolKind(ArrayRef<MappingSymbolPair> MappingSymbols,
uint64_t Address) {
@@ -1416,8 +1419,7 @@ static uint64_t dumpARMELFData(uint64_t SectionAddr, uint64_t Index,
dumpBytes(Bytes.slice(Index, 4), OS);
AlignToInstStartColumn(Start, STI, OS);
OS << "\t.word\t"
- << format_hex(support::endian::read32(Bytes.data() + Index, Endian),
- 10);
+ << format_hex(support::endian::read32(Bytes.data() + Index, Endian), 10);
return 4;
}
if (Index + 2 <= End) {
@@ -1791,9 +1793,9 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
// STAB symbol's section field refers to a valid section index. Otherwise
// the symbol may error trying to load a section that does not exist.
DataRefImpl SymDRI = Symbol.getRawDataRefImpl();
- uint8_t NType = (MachO->is64Bit() ?
- MachO->getSymbol64TableEntry(SymDRI).n_type:
- MachO->getSymbolTableEntry(SymDRI).n_type);
+ uint8_t NType =
+ (MachO->is64Bit() ? MachO->getSymbol64TableEntry(SymDRI).n_type
+ : MachO->getSymbolTableEntry(SymDRI).n_type);
if (NType & MachO::N_STAB)
continue;
}
@@ -1892,15 +1894,15 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
llvm::stable_sort(AbsoluteSymbols);
std::unique_ptr<DWARFContext> DICtx;
- LiveVariablePrinter LVP(*DT->Context->getRegisterInfo(), *DT->SubtargetInfo);
+ LiveElementPrinter LEP(*DT->Context->getRegisterInfo(), *DT->SubtargetInfo);
- if (DbgVariables != DVDisabled) {
+ if (DbgVariables != DFDisabled || DbgInlinedFunctions != DFDisabled) {
DICtx = DWARFContext::create(DbgObj);
for (const std::unique_ptr<DWARFUnit> &CU : DICtx->compile_units())
- LVP.addCompileUnit(CU->getUnitDIE(false));
+ LEP.addCompileUnit(CU->getUnitDIE(false));
}
- LLVM_DEBUG(LVP.dump());
+ LLVM_DEBUG(LEP.dump());
BBAddrMapInfo FullAddrMap;
auto ReadBBAddrMap = [&](std::optional<unsigned> SectionIndex =
@@ -2368,8 +2370,9 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
ThisBytes.size(),
DT->DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr));
- LVP.update({Index, Section.getIndex()},
- {Index + Size, Section.getIndex()}, Index + Size != End);
+ LEP.update({ThisAddr, Section.getIndex()},
+ {ThisAddr + Size, Section.getIndex()},
+ Index + Size != End);
DT->InstPrinter->setCommentStream(CommentStream);
@@ -2377,7 +2380,7 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
*DT->InstPrinter, Disassembled ? &Inst : nullptr,
Bytes.slice(Index, Size),
{SectionAddr + Index + VMAAdjustment, Section.getIndex()}, FOS,
- "", *DT->SubtargetInfo, &SP, Obj.getFileName(), &Rels, LVP);
+ "", *DT->SubtargetInfo, &SP, Obj.getFileName(), &Rels, LEP);
DT->InstPrinter->setCommentStream(llvm::nulls());
@@ -2562,22 +2565,26 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
assert(DT->Context->getAsmInfo());
DT->Printer->emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
*DT->SubtargetInfo,
- CommentStream.str(), LVP);
+ CommentStream.str(), LEP);
Comments.clear();
if (BTF)
- printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP);
+ printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LEP);
if (InlineRelocs) {
while (findRel()) {
// When --adjust-vma is used, update the address printed.
printRelocation(FOS, Obj.getFileName(), *RelCur,
SectionAddr + RelOffset + VMAAdjustment, Is64Bits);
- LVP.printAfterOtherLine(FOS, true);
+ LEP.printAfterOtherLine(FOS, true);
++RelCur;
}
}
+ object::SectionedAddress NextAddr = {
+ SectionAddr + Index + VMAAdjustment + Size, Section.getIndex()};
+ LEP.printEndLine(FOS, NextAddr);
+
Index += Size;
}
}
@@ -2869,7 +2876,8 @@ void objdump::printSectionContents(const ObjectFile *Obj) {
continue;
}
- StringRef Contents = unwrapOrError(Section.getContents(), Obj->getFileName());
+ StringRef Contents =
+ unwrapOrError(Section.getContents(), Obj->getFileName());
// Dump out the content as hex and printable ascii characters.
for (std::size_t Addr = 0, End = Contents.size(); Addr < End; Addr += 16) {
@@ -3293,8 +3301,8 @@ static bool shouldWarnForInvalidStartStopAddress(ObjectFile *Obj) {
return false;
}
-static void checkForInvalidStartStopAddress(ObjectFile *Obj,
- uint64_t Start, uint64_t Stop) {
+static void checkForInvalidStartStopAddress(ObjectFile *Obj, uint64_t Start,
+ uint64_t Stop) {
if (!shouldWarnForInvalidStartStopAddress(Obj))
return;
@@ -3617,13 +3625,25 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) {
Prefix = InputArgs.getLastArgValue(OBJDUMP_prefix).str();
parseIntArg(InputArgs, OBJDUMP_prefix_strip, PrefixStrip);
if (const opt::Arg *A = InputArgs.getLastArg(OBJDUMP_debug_vars_EQ)) {
- DbgVariables = StringSwitch<DebugVarsFormat>(A->getValue())
- .Case("ascii", DVASCII)
- .Case("unicode", DVUnicode)
- .Default(DVInvalid);
- if (DbgVariables == DVInvalid)
+ DbgVariables = StringSwitch<DebugFormat>(A->getValue())
+ .Case("ascii", DFASCII)
+ .Case("unicode", DFUnicode)
+ .Default(DFInvalid);
+ if (DbgVariables == DFInvalid)
+ invalidArgValue(A);
+ }
+
+ if (const opt::Arg *A =
+ InputArgs.getLastArg(OBJDUMP_debug_inlined_funcs_EQ)) {
+ DbgInlinedFunctions = StringSwitch<DebugFormat>(A->getValue())
+ .Case("ascii", DFASCII)
+ .Case("limits-only", DFLimitsOnly)
+ .Case("unicode", DFUnicode)
+ .Default(DFInvalid);
+ if (DbgInlinedFunctions == DFInvalid)
invalidArgValue(A);
}
+
if (const opt::Arg *A = InputArgs.getLastArg(OBJDUMP_disassembler_color_EQ)) {
DisassemblyColor = StringSwitch<ColorOutput>(A->getValue())
.Case("on", ColorOutput::Enable)
@@ -3634,7 +3654,7 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) {
invalidArgValue(A);
}
- parseIntArg(InputArgs, OBJDUMP_debug_vars_indent_EQ, DbgIndent);
+ parseIntArg(InputArgs, OBJDUMP_debug_indent_EQ, DbgIndent);
parseMachOOptions(InputArgs);
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.h b/llvm/tools/llvm-objdump/llvm-objdump.h
index 25d9c1e..ce06429 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.h
+++ b/llvm/tools/llvm-objdump/llvm-objdump.h
@@ -40,11 +40,12 @@ class XCOFFObjectFile;
namespace objdump {
-enum DebugVarsFormat { DVDisabled, DVUnicode, DVASCII, DVInvalid };
+enum DebugFormat { DFASCII, DFDisabled, DFInvalid, DFLimitsOnly, DFUnicode };
extern bool ArchiveHeaders;
extern int DbgIndent;
-extern DebugVarsFormat DbgVariables;
+extern DebugFormat DbgVariables;
+extern DebugFormat DbgInlinedFunctions;
extern bool Demangle;
extern bool Disassemble;
extern bool DisassembleAll;
@@ -126,7 +127,7 @@ void printSectionContents(const object::ObjectFile *O);
void reportWarning(const Twine &Message, StringRef File);
template <typename T, typename... Ts>
-T unwrapOrError(Expected<T> EO, Ts &&... Args) {
+T unwrapOrError(Expected<T> EO, Ts &&...Args) {
if (EO)
return std::move(*EO);
reportError(EO.takeError(), std::forward<Ts>(Args)...);
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index ccc64fe..2699e10 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -30,6 +30,7 @@
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/BinaryFormat/SFrame.h"
#include "llvm/Demangle/Demangle.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ELF.h"
@@ -38,6 +39,7 @@
#include "llvm/Object/Error.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/RelocationResolver.h"
+#include "llvm/Object/SFrameParser.h"
#include "llvm/Object/StackMapParser.h"
#include "llvm/Support/AArch64AttributeParser.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -225,6 +227,8 @@ public:
void printArchSpecificInfo() override;
void printStackMap() const override;
void printMemtag() override;
+ void printSectionsAsSFrame(ArrayRef<std::string> Sections) override;
+
ArrayRef<uint8_t> getMemtagGlobalsSectionContents(uint64_t ExpectedAddr);
// Hash histogram shows statistics of how efficient the hash was for the
@@ -1083,26 +1087,25 @@ const EnumEntry<unsigned> ElfObjectFileType[] = {
};
const EnumEntry<unsigned> ElfOSABI[] = {
- {"SystemV", "UNIX - System V", ELF::ELFOSABI_NONE},
- {"HPUX", "UNIX - HP-UX", ELF::ELFOSABI_HPUX},
- {"NetBSD", "UNIX - NetBSD", ELF::ELFOSABI_NETBSD},
- {"GNU/Linux", "UNIX - GNU", ELF::ELFOSABI_LINUX},
- {"GNU/Hurd", "GNU/Hurd", ELF::ELFOSABI_HURD},
- {"Solaris", "UNIX - Solaris", ELF::ELFOSABI_SOLARIS},
- {"AIX", "UNIX - AIX", ELF::ELFOSABI_AIX},
- {"IRIX", "UNIX - IRIX", ELF::ELFOSABI_IRIX},
- {"FreeBSD", "UNIX - FreeBSD", ELF::ELFOSABI_FREEBSD},
- {"TRU64", "UNIX - TRU64", ELF::ELFOSABI_TRU64},
- {"Modesto", "Novell - Modesto", ELF::ELFOSABI_MODESTO},
- {"OpenBSD", "UNIX - OpenBSD", ELF::ELFOSABI_OPENBSD},
- {"OpenVMS", "VMS - OpenVMS", ELF::ELFOSABI_OPENVMS},
- {"NSK", "HP - Non-Stop Kernel", ELF::ELFOSABI_NSK},
- {"AROS", "AROS", ELF::ELFOSABI_AROS},
- {"FenixOS", "FenixOS", ELF::ELFOSABI_FENIXOS},
- {"CloudABI", "CloudABI", ELF::ELFOSABI_CLOUDABI},
- {"CUDA", "NVIDIA - CUDA", ELF::ELFOSABI_CUDA},
- {"Standalone", "Standalone App", ELF::ELFOSABI_STANDALONE}
-};
+ {"SystemV", "UNIX - System V", ELF::ELFOSABI_NONE},
+ {"HPUX", "UNIX - HP-UX", ELF::ELFOSABI_HPUX},
+ {"NetBSD", "UNIX - NetBSD", ELF::ELFOSABI_NETBSD},
+ {"GNU/Linux", "UNIX - GNU", ELF::ELFOSABI_LINUX},
+ {"GNU/Hurd", "GNU/Hurd", ELF::ELFOSABI_HURD},
+ {"Solaris", "UNIX - Solaris", ELF::ELFOSABI_SOLARIS},
+ {"AIX", "UNIX - AIX", ELF::ELFOSABI_AIX},
+ {"IRIX", "UNIX - IRIX", ELF::ELFOSABI_IRIX},
+ {"FreeBSD", "UNIX - FreeBSD", ELF::ELFOSABI_FREEBSD},
+ {"TRU64", "UNIX - TRU64", ELF::ELFOSABI_TRU64},
+ {"Modesto", "Novell - Modesto", ELF::ELFOSABI_MODESTO},
+ {"OpenBSD", "UNIX - OpenBSD", ELF::ELFOSABI_OPENBSD},
+ {"OpenVMS", "VMS - OpenVMS", ELF::ELFOSABI_OPENVMS},
+ {"NSK", "HP - Non-Stop Kernel", ELF::ELFOSABI_NSK},
+ {"AROS", "AROS", ELF::ELFOSABI_AROS},
+ {"FenixOS", "FenixOS", ELF::ELFOSABI_FENIXOS},
+ {"CloudABI", "CloudABI", ELF::ELFOSABI_CLOUDABI},
+ {"CUDA", "NVIDIA - CUDA", ELF::ELFOSABI_CUDA},
+ {"Standalone", "Standalone App", ELF::ELFOSABI_STANDALONE}};
const EnumEntry<unsigned> AMDGPUElfOSABI[] = {
{"AMDGPU_HSA", "AMDGPU - HSA", ELF::ELFOSABI_AMDGPU_HSA},
@@ -1667,16 +1670,17 @@ const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion4[] = {
};
const EnumEntry<unsigned> ElfHeaderNVPTXFlags[] = {
- ENUM_ENT(EF_CUDA_SM20, "sm_20"), ENUM_ENT(EF_CUDA_SM21, "sm_21"),
- ENUM_ENT(EF_CUDA_SM30, "sm_30"), ENUM_ENT(EF_CUDA_SM32, "sm_32"),
- ENUM_ENT(EF_CUDA_SM35, "sm_35"), ENUM_ENT(EF_CUDA_SM37, "sm_37"),
- ENUM_ENT(EF_CUDA_SM50, "sm_50"), ENUM_ENT(EF_CUDA_SM52, "sm_52"),
- ENUM_ENT(EF_CUDA_SM53, "sm_53"), ENUM_ENT(EF_CUDA_SM60, "sm_60"),
- ENUM_ENT(EF_CUDA_SM61, "sm_61"), ENUM_ENT(EF_CUDA_SM62, "sm_62"),
- ENUM_ENT(EF_CUDA_SM70, "sm_70"), ENUM_ENT(EF_CUDA_SM72, "sm_72"),
- ENUM_ENT(EF_CUDA_SM75, "sm_75"), ENUM_ENT(EF_CUDA_SM80, "sm_80"),
- ENUM_ENT(EF_CUDA_SM86, "sm_86"), ENUM_ENT(EF_CUDA_SM87, "sm_87"),
- ENUM_ENT(EF_CUDA_SM89, "sm_89"), ENUM_ENT(EF_CUDA_SM90, "sm_90"),
+ ENUM_ENT(EF_CUDA_SM20, "sm_20"), ENUM_ENT(EF_CUDA_SM21, "sm_21"),
+ ENUM_ENT(EF_CUDA_SM30, "sm_30"), ENUM_ENT(EF_CUDA_SM32, "sm_32"),
+ ENUM_ENT(EF_CUDA_SM35, "sm_35"), ENUM_ENT(EF_CUDA_SM37, "sm_37"),
+ ENUM_ENT(EF_CUDA_SM50, "sm_50"), ENUM_ENT(EF_CUDA_SM52, "sm_52"),
+ ENUM_ENT(EF_CUDA_SM53, "sm_53"), ENUM_ENT(EF_CUDA_SM60, "sm_60"),
+ ENUM_ENT(EF_CUDA_SM61, "sm_61"), ENUM_ENT(EF_CUDA_SM62, "sm_62"),
+ ENUM_ENT(EF_CUDA_SM70, "sm_70"), ENUM_ENT(EF_CUDA_SM72, "sm_72"),
+ ENUM_ENT(EF_CUDA_SM75, "sm_75"), ENUM_ENT(EF_CUDA_SM80, "sm_80"),
+ ENUM_ENT(EF_CUDA_SM86, "sm_86"), ENUM_ENT(EF_CUDA_SM87, "sm_87"),
+ ENUM_ENT(EF_CUDA_SM89, "sm_89"), ENUM_ENT(EF_CUDA_SM90, "sm_90"),
+ ENUM_ENT(EF_CUDA_SM100, "sm_100"), ENUM_ENT(EF_CUDA_SM120, "sm_120"),
};
const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
@@ -3651,10 +3655,16 @@ template <class ELFT> void GNUELFDumper<ELFT>::printFileHeaders() {
else if (e.e_machine == EM_XTENSA)
ElfFlags = printFlags(e.e_flags, ArrayRef(ElfHeaderXtensaFlags),
unsigned(ELF::EF_XTENSA_MACH));
- else if (e.e_machine == EM_CUDA)
+ else if (e.e_machine == EM_CUDA) {
ElfFlags = printFlags(e.e_flags, ArrayRef(ElfHeaderNVPTXFlags),
unsigned(ELF::EF_CUDA_SM));
- else if (e.e_machine == EM_AMDGPU) {
+ if (e.e_ident[ELF::EI_ABIVERSION] == ELF::ELFABIVERSION_CUDA_V1 &&
+ (e.e_flags & ELF::EF_CUDA_ACCELERATORS_V1))
+ ElfFlags += "a";
+ else if (e.e_ident[ELF::EI_ABIVERSION] == ELF::ELFABIVERSION_CUDA_V2 &&
+ (e.e_flags & ELF::EF_CUDA_ACCELERATORS))
+ ElfFlags += "a";
+ } else if (e.e_machine == EM_AMDGPU) {
switch (e.e_ident[ELF::EI_ABIVERSION]) {
default:
break;
@@ -6429,6 +6439,61 @@ template <typename ELFT> void ELFDumper<ELFT>::printMemtag() {
printMemtag(DynamicEntries, AndroidNoteDesc, GlobalDescriptors);
}
+template <typename ELFT>
+void ELFDumper<ELFT>::printSectionsAsSFrame(ArrayRef<std::string> Sections) {
+ constexpr endianness E = ELFT::Endianness;
+ for (object::SectionRef Section :
+ getSectionRefsByNameOrIndex(ObjF, Sections)) {
+ // Validity of section names is checked in getSectionRefsByNameOrIndex.
+ StringRef SectionName = cantFail(Section.getName());
+
+ DictScope SectionScope(W,
+ formatv("SFrame section '{0}'", SectionName).str());
+
+ StringRef SectionContent;
+ if (Error Err = Section.getContents().moveInto(SectionContent)) {
+ reportWarning(std::move(Err), FileName);
+ continue;
+ }
+
+ Expected<object::SFrameParser<E>> Parser =
+ object::SFrameParser<E>::create(arrayRefFromStringRef(SectionContent));
+ if (!Parser) {
+ reportWarning(createError("invalid sframe section: " +
+ toString(Parser.takeError())),
+ FileName);
+ continue;
+ }
+
+ DictScope HeaderScope(W, "Header");
+
+ const sframe::Preamble<E> &Preamble = Parser->getPreamble();
+ W.printHex("Magic", Preamble.Magic.value());
+ W.printEnum("Version", Preamble.Version.value(), sframe::getVersions());
+ W.printFlags("Flags", Preamble.Flags.value(), sframe::getFlags());
+
+ const sframe::Header<E> &Header = Parser->getHeader();
+ W.printEnum("ABI", Header.ABIArch.value(), sframe::getABIs());
+
+ W.printNumber(("CFA fixed FP offset" +
+ Twine(Parser->usesFixedFPOffset() ? "" : " (unused)"))
+ .str(),
+ Header.CFAFixedFPOffset.value());
+
+ W.printNumber(("CFA fixed RA offset" +
+ Twine(Parser->usesFixedRAOffset() ? "" : " (unused)"))
+ .str(),
+ Header.CFAFixedRAOffset.value());
+
+ W.printNumber("Auxiliary header length", Header.AuxHdrLen.value());
+ W.printNumber("Num FDEs", Header.NumFDEs.value());
+ W.printNumber("Num FREs", Header.NumFREs.value());
+ W.printNumber("FRE subsection length", Header.FRELen.value());
+ W.printNumber("FDE subsection offset", Header.FDEOff.value());
+ W.printNumber("FRE subsection offset", Header.FREOff.value());
+ }
+}
+
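Condensed, the dumping loop above follows a create-or-warn pattern on SFrameParser. A minimal sketch assuming the same API from llvm/Object/SFrameParser.h that this patch uses (the helper name dumpSFrame is illustrative, and only the little-endian instantiation is shown):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/Object/SFrameParser.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Prints a couple of header fields; returns false on a malformed section.
    static bool dumpSFrame(ArrayRef<uint8_t> Content) {
      Expected<object::SFrameParser<endianness::little>> Parser =
          object::SFrameParser<endianness::little>::create(Content);
      if (!Parser) {
        // Same failure modes the CASE1 warnings show: truncation, bad magic,
        // bad version.
        consumeError(Parser.takeError());
        return false;
      }
      const auto &Header = Parser->getHeader();
      outs() << "Num FDEs: " << Header.NumFDEs.value() << "\n";
      outs() << "Num FREs: " << Header.NumFREs.value() << "\n";
      return true;
    }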
template <class ELFT> void GNUELFDumper<ELFT>::printELFLinkerOptions() {
OS << "printELFLinkerOptions not implemented!\n";
}
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 1a535ed..bd670ae 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -102,9 +102,9 @@ void ObjDumper::printFileSummary(StringRef FileStr, object::ObjectFile &Obj,
this->printLoadName();
}
-static std::vector<object::SectionRef>
-getSectionRefsByNameOrIndex(const object::ObjectFile &Obj,
- ArrayRef<std::string> Sections) {
+std::vector<object::SectionRef>
+ObjDumper::getSectionRefsByNameOrIndex(const object::ObjectFile &Obj,
+ ArrayRef<std::string> Sections) {
std::vector<object::SectionRef> Ret;
std::map<std::string, bool, std::less<>> SecNames;
std::map<unsigned, bool> SecIndices;
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index a76afbe..1dc2966 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -139,6 +139,7 @@ public:
virtual void printSectionDetails() {}
virtual void printArchSpecificInfo() {}
virtual void printMemtag() {}
+ virtual void printSectionsAsSFrame(ArrayRef<std::string> Sections) {}
// Only implemented for PE/COFF.
virtual void printCOFFImports() { }
@@ -190,6 +191,10 @@ public:
protected:
ScopedPrinter &W;
+ static std::vector<object::SectionRef>
+ getSectionRefsByNameOrIndex(const object::ObjectFile &Obj,
+ ArrayRef<std::string> Sections);
+
private:
virtual void printSymbols(bool ExtraSymInfo) {}
virtual void printSymbols(std::optional<SymbolComparator> Comp) {}
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index f95461a..48d43cc 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -62,6 +62,8 @@ def memtag : FF<"memtag", "Display memory tagging metadata (modes, Android notes
def needed_libs : FF<"needed-libs", "Display the needed libraries">, Group<grp_elf>;
def notes : FF<"notes", "Display notes">, Group<grp_elf>;
def program_headers : FF<"program-headers", "Display program headers">, Group<grp_elf>;
+def sframe_EQ : Joined<["--"], "sframe=">, HelpText<"Display SFrame section <name>">, MetaVarName<"<name>">, Group<grp_elf>;
+def sframe : FF<"sframe", "Alias for --sframe=.sframe">, Alias<sframe_EQ>, AliasArgs<[".sframe"]>, Group<grp_elf>;
def version_info : FF<"version-info", "Display version sections">, Group<grp_elf>;
// Mach-O specific options.
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 1231c02..4c84ed7 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -137,6 +137,7 @@ static bool NeededLibraries;
static bool Notes;
static bool ProgramHeaders;
static bool SectionGroups;
+static std::vector<std::string> SFrame;
static bool VersionInfo;
// Mach-O specific options.
@@ -275,6 +276,7 @@ static void parseOptions(const opt::InputArgList &Args) {
opts::PrettyPrint = Args.hasArg(OPT_pretty_print);
opts::ProgramHeaders = Args.hasArg(OPT_program_headers);
opts::SectionGroups = Args.hasArg(OPT_section_groups);
+ opts::SFrame = Args.getAllArgValues(OPT_sframe_EQ);
if (Arg *A = Args.getLastArg(OPT_sort_symbols_EQ)) {
for (StringRef KeyStr : llvm::split(A->getValue(), ",")) {
SortSymbolKeyTy KeyType = StringSwitch<SortSymbolKeyTy>(KeyStr)
@@ -478,6 +480,8 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
Dumper->printNotes();
if (opts::Memtag)
Dumper->printMemtag();
+ if (!opts::SFrame.empty())
+ Dumper->printSectionsAsSFrame(opts::SFrame);
}
if (Obj.isCOFF()) {
if (opts::COFFImports)
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 4b47655..6af2006 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -915,11 +915,11 @@ TEST(ValueTracking, propagatesPoison) {
{true, "call float @llvm.sin.f32(float %fx)", 0},
{true, "call float @llvm.cos.f32(float %fx)", 0},
{true, "call float @llvm.pow.f32(float %fx, float %fy)", 0},
- {false, "call float @llvm.exp.f32(float %fx)", 0},
- {false, "call float @llvm.exp2.f32(float %fx)", 0},
- {false, "call float @llvm.log.f32(float %fx)", 0},
- {false, "call float @llvm.log10.f32(float %fx)", 0},
- {false, "call float @llvm.log2.f32(float %fx)", 0},
+ {true, "call float @llvm.exp.f32(float %fx)", 0},
+ {true, "call float @llvm.exp2.f32(float %fx)", 0},
+ {true, "call float @llvm.log.f32(float %fx)", 0},
+ {true, "call float @llvm.log10.f32(float %fx)", 0},
+ {true, "call float @llvm.log2.f32(float %fx)", 0},
{false, "call float @llvm.fma.f32(float %fx, float %fx, float %fy)", 0},
{false, "call float @llvm.fabs.f32(float %fx)", 0},
{false, "call float @llvm.minnum.f32(float %fx, float %fy)", 0},
@@ -927,17 +927,17 @@ TEST(ValueTracking, propagatesPoison) {
{false, "call float @llvm.minimum.f32(float %fx, float %fy)", 0},
{false, "call float @llvm.maximum.f32(float %fx, float %fy)", 0},
{false, "call float @llvm.copysign.f32(float %fx, float %fy)", 0},
- {false, "call float @llvm.floor.f32(float %fx)", 0},
- {false, "call float @llvm.ceil.f32(float %fx)", 0},
- {false, "call float @llvm.trunc.f32(float %fx)", 0},
- {false, "call float @llvm.rint.f32(float %fx)", 0},
- {false, "call float @llvm.nearbyint.f32(float %fx)", 0},
- {false, "call float @llvm.round.f32(float %fx)", 0},
- {false, "call float @llvm.roundeven.f32(float %fx)", 0},
+ {true, "call float @llvm.floor.f32(float %fx)", 0},
+ {true, "call float @llvm.ceil.f32(float %fx)", 0},
+ {true, "call float @llvm.trunc.f32(float %fx)", 0},
+ {true, "call float @llvm.rint.f32(float %fx)", 0},
+ {true, "call float @llvm.nearbyint.f32(float %fx)", 0},
+ {true, "call float @llvm.round.f32(float %fx)", 0},
+ {true, "call float @llvm.roundeven.f32(float %fx)", 0},
{false, "call i32 @llvm.lround.f32(float %fx)", 0},
{false, "call i64 @llvm.llround.f32(float %fx)", 0},
- {false, "call i32 @llvm.lrint.f32(float %fx)", 0},
- {false, "call i64 @llvm.llrint.f32(float %fx)", 0},
+ {true, "call i32 @llvm.lrint.f32(float %fx)", 0},
+ {true, "call i64 @llvm.llrint.f32(float %fx)", 0},
{false, "call float @llvm.fmuladd.f32(float %fx, float %fx, float %fy)",
0}};
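The flipped expectations encode that these FP intrinsics (exp/exp2/log/log10/log2, floor/ceil/trunc, rint/nearbyint/round/roundeven, lrint/llrint) are now modeled as propagating poison from their floating-point operand. A minimal standalone sketch of what each `true` row asserts, with an assumed harness around the real propagatesPoison() query:

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/SourceMgr.h"
    using namespace llvm;

    bool floorPropagatesPoison() {
      LLVMContext Ctx;
      SMDiagnostic Err;
      std::unique_ptr<Module> M = parseAssemblyString(
          "declare float @llvm.floor.f32(float)\n"
          "define float @f(float %x) {\n"
          "  %r = call float @llvm.floor.f32(float %x)\n"
          "  ret float %r\n"
          "}\n",
          Err, Ctx);
      Instruction &I = M->getFunction("f")->getEntryBlock().front();
      // Does poison in the %x operand propagate to %r?
      return propagatesPoison(I.getOperandUse(0)); // expected: true after this patch
    }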
diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
index 6189d09..95c26b1 100644
--- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
@@ -431,8 +431,8 @@ TEST_F(OpenMPDecompositionTest, Firstprivate3) {
std::string Dir0 = stringify(Dec.output[0]);
std::string Dir1 = stringify(Dec.output[1]);
std::string Dir2 = stringify(Dec.output[2]);
- ASSERT_EQ(Dir0, "target map(2, , , , (x))"); // (12), (27)
- ASSERT_EQ(Dir1, "teams shared(x)"); // (6), (17)
+ ASSERT_EQ(Dir0, "target map(2, , , , , (x))"); // (12), (27)
+ ASSERT_EQ(Dir1, "teams shared(x)"); // (6), (17)
ASSERT_EQ(Dir2, "distribute firstprivate(x) lastprivate(, (x))"); // (5), (21)
}
@@ -574,9 +574,9 @@ TEST_F(OpenMPDecompositionTest, Lastprivate3) {
std::string Dir0 = stringify(Dec.output[0]);
std::string Dir1 = stringify(Dec.output[1]);
std::string Dir2 = stringify(Dec.output[2]);
- ASSERT_EQ(Dir0, "target map(2, , , , (x))"); // (21), (27)
- ASSERT_EQ(Dir1, "parallel shared(x)"); // (22)
- ASSERT_EQ(Dir2, "do lastprivate(, (x))"); // (21)
+ ASSERT_EQ(Dir0, "target map(2, , , , , (x))"); // (21), (27)
+ ASSERT_EQ(Dir1, "parallel shared(x)"); // (22)
+ ASSERT_EQ(Dir2, "do lastprivate(, (x))"); // (21)
}
// SHARED
@@ -984,9 +984,9 @@ TEST_F(OpenMPDecompositionTest, Reduction7) {
std::string Dir0 = stringify(Dec.output[0]);
std::string Dir1 = stringify(Dec.output[1]);
std::string Dir2 = stringify(Dec.output[2]);
- ASSERT_EQ(Dir0, "target map(2, , , , (x))"); // (36), (10)
- ASSERT_EQ(Dir1, "parallel shared(x)"); // (36), (1), (4)
- ASSERT_EQ(Dir2, "do reduction(, (3), (x))"); // (36)
+ ASSERT_EQ(Dir0, "target map(2, , , , , (x))"); // (36), (10)
+ ASSERT_EQ(Dir1, "parallel shared(x)"); // (36), (1), (4)
+ ASSERT_EQ(Dir2, "do reduction(, (3), (x))"); // (36)
}
// IF
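The only change in these assertions is one extra empty position in the stringified map clause, i.e. the map clause's modifier tuple gained a field (which modifier it is does not show up in this hunk). A toy illustration, not the actual OpenMP stringifier, of why an added empty field reads as one more comma:

    #include <string>
    #include <vector>

    // Comma-join fields the way the stringified clauses above read; an empty
    // field still contributes a separator.
    std::string join(const std::vector<std::string> &Fields) {
      std::string Out;
      for (size_t I = 0; I < Fields.size(); ++I) {
        if (I)
          Out += ", ";
        Out += Fields[I];
      }
      return Out;
    }
    // join({"2", "", "", "", "(x)"})     == "2, , , , (x)"    (old: 5 fields)
    // join({"2", "", "", "", "", "(x)"}) == "2, , , , , (x)"  (new: 6 fields)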
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index baa13e1..0065615 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -185,10 +185,8 @@ TEST(MetadataTest, DeleteInstUsedByDbgRecord) {
Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt();
// Find the dbg.value using %b.
- SmallVector<DbgValueInst *, 1> DVIs;
SmallVector<DbgVariableRecord *, 1> DVRs;
- findDbgValues(DVIs, &I, &DVRs);
- assert(DVIs.empty());
+ findDbgValues(&I, DVRs);
// Delete %b. The dbg.value should now point to undef.
I.eraseFromParent();
@@ -230,7 +228,6 @@ TEST(MetadataTest, GlobalConstantMetadataUsedByDbgRecord) {
Value *V = M->getNamedValue("x");
// Find the dbg.value
- auto DVIs = findDbgDeclares(V);
auto DVRs = findDVRDeclares(V);
auto DVRVs = findDVRValues(V);
@@ -312,10 +309,8 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt();
// Find the DbgVariableRecords using %b.
- SmallVector<DbgValueInst *, 2> DVIs;
SmallVector<DbgVariableRecord *, 2> DVRs;
- findDbgValues(DVIs, &I, &DVRs);
- assert(DVIs.empty());
+ findDbgValues(&I, DVRs);
ASSERT_EQ(DVRs.size(), 2u);
// Delete %b. The DbgVariableRecord should now point to undef.
@@ -359,11 +354,9 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) {
Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt();
- SmallVector<DbgValueInst *, 2> DVIs;
SmallVector<DbgVariableRecord *, 2> DVRs;
- findDbgValues(DVIs, &I, &DVRs);
- ASSERT_EQ(DVIs.size(), 0u);
+ findDbgValues(&I, DVRs);
ASSERT_EQ(DVRs.size(), 2u);
// The correct order of dbg.values is given by their use-list, which becomes
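All of these hunks make the same mechanical update: findDbgValues() no longer takes and fills a vector of dbg.value intrinsics; it now takes the queried value first and collects only DbgVariableRecords. A self-contained sketch of the post-patch calling convention (the helper name is invented):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/DebugInfo.h"
    #include "llvm/IR/DebugProgramInstruction.h"
    using namespace llvm;

    // Hypothetical helper: count the dbg_value records referring to V.
    static unsigned countDbgValueUses(Value *V) {
      SmallVector<DbgVariableRecord *> DVRs;
      findDbgValues(V, DVRs); // new signature: value first, records out-param
      return DVRs.size();
    }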
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index d048e87..868c40b 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -31,6 +31,7 @@ add_llvm_unittest(SupportTests
DataExtractorTest.cpp
DebugCounterTest.cpp
DebugTest.cpp
+ DebugLogTest.cpp
DivisionByConstantTest.cpp
DJBTest.cpp
EndianStreamTest.cpp
diff --git a/llvm/unittests/Support/DebugLogTest.cpp b/llvm/unittests/Support/DebugLogTest.cpp
new file mode 100644
index 0000000..5136999
--- /dev/null
+++ b/llvm/unittests/Support/DebugLogTest.cpp
@@ -0,0 +1,77 @@
+//===- llvm/unittest/Support/DebugLogTest.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/raw_ostream.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <string>
+using namespace llvm;
+using testing::Eq;
+using testing::HasSubstr;
+
+#ifndef NDEBUG
+TEST(DebugLogTest, Basic) {
+ llvm::DebugFlag = true;
+ static const char *DT[] = {"A", "B"};
+
+ // Clear debug types.
+ setCurrentDebugTypes(DT, 0);
+ {
+ std::string str;
+ raw_string_ostream os(str);
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, nullptr) << "NoType";
+ EXPECT_TRUE(StringRef(os.str()).starts_with('['));
+ EXPECT_TRUE(StringRef(os.str()).ends_with("NoType\n"));
+ }
+
+ setCurrentDebugTypes(DT, 2);
+ {
+ std::string str;
+ raw_string_ostream os(str);
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "A";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << "B";
+ EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), HasSubstr("B\n")));
+ }
+
+ setCurrentDebugType("A");
+ {
+ std::string str;
+ raw_string_ostream os(str);
+ // Just check that the macro doesn't result in dangling else.
+ if (true)
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "A";
+ else
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "B";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << "B";
+ EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), Not(HasSubstr("B\n"))));
+
+ int count = 0;
+ auto inc = [&]() { return ++count; };
+ EXPECT_THAT(count, Eq(0));
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << inc();
+ EXPECT_THAT(count, Eq(1));
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << inc();
+ EXPECT_THAT(count, Eq(1));
+ }
+}
+#else
+TEST(DebugLogTest, Basic) {
+ // LDBG should be compiled out in NDEBUG, so just check it compiles and has
+ // no effect.
+ llvm::DebugFlag = true;
+ static const char *DT[] = {};
+ setCurrentDebugTypes(DT, 0);
+ int count = 0;
+ auto inc = [&]() { return ++count; };
+ EXPECT_THAT(count, Eq(0));
+ LDBG() << inc();
+ EXPECT_THAT(count, Eq(0));
+}
+#endif
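For context, a sketch of how the macro under test is meant to appear at call sites, assuming the usual DEBUG_TYPE convention (pass name and message are invented):

    #include "llvm/Support/DebugLog.h"
    #define DEBUG_TYPE "my-pass"
    using namespace llvm;

    static void reportSimplified(unsigned NumSimplified) {
      // Prints only when debug logging for "my-pass" is enabled; compiles
      // away (and skips evaluating its operands) in NDEBUG builds, as the
      // #else branch above checks via inc().
      LDBG() << "simplified " << NumSimplified << " instructions";
    }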
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index dd2a624..0c70feb 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -673,20 +673,17 @@ TEST(Local, FindDbgRecords) {
Function &Fun = *cast<Function>(M->getNamedValue("fun"));
Value *Arg = Fun.getArg(0);
- SmallVector<DbgVariableIntrinsic *> Users;
SmallVector<DbgVariableRecord *> Records;
// Arg (%a) is used twice by a single dbg_assign. Check findDbgUsers returns
// only 1 pointer to it rather than 2.
- findDbgUsers(Users, Arg, &Records);
- EXPECT_EQ(Users.size(), 0u);
+ findDbgUsers(Arg, Records);
EXPECT_EQ(Records.size(), 1u);
SmallVector<DbgValueInst *> Vals;
Records.clear();
// Arg (%a) is used twice by a single dbg_assign. Check findDbgValues returns
// only 1 pointer to it rather than 2.
- findDbgValues(Vals, Arg, &Records);
- EXPECT_EQ(Vals.size(), 0u);
+ findDbgValues(Arg, Records);
EXPECT_EQ(Records.size(), 1u);
}
@@ -787,20 +784,16 @@ TEST(Local, ReplaceAllDbgUsesWith) {
// Simulate i32* <-> i64* conversion.
EXPECT_TRUE(replaceAllDbgUsesWith(D, C, C, DT));
- SmallVector<DbgVariableIntrinsic *, 2> CDbgVals;
SmallVector<DbgVariableRecord *, 2> CDbgRecords;
- findDbgUsers(CDbgVals, &C, &CDbgRecords);
- EXPECT_EQ(0U, CDbgVals.size());
+ findDbgUsers(&C, CDbgRecords);
EXPECT_EQ(2U, CDbgRecords.size());
EXPECT_TRUE(all_of(
CDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
EXPECT_TRUE(replaceAllDbgUsesWith(C, D, D, DT));
- SmallVector<DbgVariableIntrinsic *, 2> DDbgVals;
SmallVector<DbgVariableRecord *, 2> DDbgRecords;
- findDbgUsers(DDbgVals, &D, &DDbgRecords);
- EXPECT_EQ(0U, DDbgVals.size());
+ findDbgUsers(&D, DDbgRecords);
EXPECT_EQ(2U, DDbgRecords.size());
EXPECT_TRUE(all_of(
DDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
@@ -824,10 +817,8 @@ TEST(Local, ReplaceAllDbgUsesWith) {
EXPECT_EQ(BarrierDbgVal->getNumVariableLocationOps(), 1u);
EXPECT_TRUE(BarrierDbgVal->isKillLocation());
- SmallVector<DbgValueInst *, 1> BarrierDbgVals;
SmallVector<DbgVariableRecord *, 8> BarrierDbgRecs;
- findDbgValues(BarrierDbgVals, &F_, &BarrierDbgRecs);
- EXPECT_EQ(0U, BarrierDbgVals.size());
+ findDbgValues(&F_, BarrierDbgRecs);
EXPECT_EQ(0U, BarrierDbgRecs.size());
// Simulate i32 -> i64 conversion to test sign-extension. Here are some
@@ -838,10 +829,8 @@ TEST(Local, ReplaceAllDbgUsesWith) {
// 4-6) like (1-3), but with a fragment
EXPECT_TRUE(replaceAllDbgUsesWith(B, A, A, DT));
- SmallVector<DbgValueInst *, 8> BDbgVals;
SmallVector<DbgVariableRecord *, 8> BDbgRecs;
- findDbgValues(BDbgVals, &A, &BDbgRecs);
- EXPECT_EQ(0U, BDbgVals.size());
+ findDbgValues(&A, BDbgRecs);
EXPECT_EQ(6U, BDbgRecs.size());
// Check that %a has a dbg.value with a DIExpression matching \p Ops.
diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp
index afc892b..e80adce 100644
--- a/llvm/utils/TableGen/CompressInstEmitter.cpp
+++ b/llvm/utils/TableGen/CompressInstEmitter.cpp
@@ -217,12 +217,8 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
Inst.Operands.back().MIOperandNo + Inst.Operands.back().MINumOperands;
OperandMap.grow(NumMIOperands);
- // TiedCount keeps track of the number of operands skipped in Inst
- // operands list to get to the corresponding Dag operand. This is
- // necessary because the number of operands in Inst might be greater
- // than number of operands in the Dag due to how tied operands
- // are represented.
- unsigned TiedCount = 0;
+ // Tied operands are not represented in the DAG, so we count them separately.
+ unsigned DAGOpNo = 0;
unsigned OpNo = 0;
for (const auto &Opnd : Inst.Operands) {
int TiedOpIdx = Opnd.getTiedRegister();
@@ -231,15 +227,25 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
// Set the entry in OperandMap for the tied operand we're skipping.
OperandMap[OpNo] = OperandMap[TiedOpIdx];
++OpNo;
- ++TiedCount;
+
+ // Source instructions can have at most 1 tied operand.
+ if (IsSourceInst && (OpNo - DAGOpNo > 1))
+ PrintFatalError(Rec->getLoc(),
+ "Input operands for Inst '" + Inst.TheDef->getName() +
+ "' and input Dag operand count mismatch");
+
continue;
}
- for (unsigned SubOp = 0; SubOp != Opnd.MINumOperands; ++SubOp, ++OpNo) {
- unsigned DAGOpNo = OpNo - TiedCount;
+ for (unsigned SubOp = 0; SubOp != Opnd.MINumOperands;
+ ++SubOp, ++OpNo, ++DAGOpNo) {
const Record *OpndRec = Opnd.Rec;
if (Opnd.MINumOperands > 1)
OpndRec = cast<DefInit>(Opnd.MIOperandInfo->getArg(SubOp))->getDef();
+ if (DAGOpNo >= Dag->getNumArgs())
+ PrintFatalError(Rec->getLoc(), "Inst '" + Inst.TheDef->getName() +
+ "' and Dag operand count mismatch");
+
if (const auto *DI = dyn_cast<DefInit>(Dag->getArg(DAGOpNo))) {
if (DI->getDef()->isSubClassOf("Register")) {
// Check if the fixed register belongs to the Register class.
@@ -269,7 +275,7 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
OperandMap[OpNo].Kind = OpData::Operand;
} else if (const auto *II = dyn_cast<IntInit>(Dag->getArg(DAGOpNo))) {
// Validate that corresponding instruction operand expects an immediate.
- if (OpndRec->isSubClassOf("RegisterClass"))
+ if (!OpndRec->isSubClassOf("Operand"))
PrintFatalError(Rec->getLoc(), "Error in Dag '" + Dag->getAsString() +
"' Found immediate: '" +
II->getAsString() +
@@ -312,43 +318,11 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
Operands[ArgName] = {DAGOpNo, OpNo};
}
}
-}
-// Verify the Dag operand count is enough to build an instruction.
-static bool verifyDagOpCount(const CodeGenInstruction &Inst, const DagInit *Dag,
- bool IsSource) {
- unsigned NumMIOperands = 0;
-
- unsigned TiedOpCount = 0;
- for (const auto &Op : Inst.Operands) {
- NumMIOperands += Op.MINumOperands;
- if (Op.getTiedRegister() != -1)
- TiedOpCount++;
- }
-
- if (Dag->getNumArgs() == NumMIOperands)
- return true;
-
- // Source instructions are non compressed instructions and have at most one
- // tied operand.
- if (IsSource && (TiedOpCount > 1))
- PrintFatalError(Inst.TheDef->getLoc(),
- "Input operands for Inst '" + Inst.TheDef->getName() +
- "' and input Dag operand count mismatch");
-
- // The Dag can't have more arguments than the Instruction.
- if (Dag->getNumArgs() > NumMIOperands)
- PrintFatalError(Inst.TheDef->getLoc(),
- "Inst '" + Inst.TheDef->getName() +
- "' and Dag operand count mismatch");
-
- // The Instruction might have tied operands so the Dag might have
- // a fewer operand count.
- if (Dag->getNumArgs() != (NumMIOperands - TiedOpCount))
- PrintFatalError(Inst.TheDef->getLoc(),
- "Inst '" + Inst.TheDef->getName() +
- "' and Dag operand count mismatch");
- return true;
+ // We shouldn't have extra Dag operands.
+ if (DAGOpNo != Dag->getNumArgs())
+ PrintFatalError(Rec->getLoc(), "Inst '" + Inst.TheDef->getName() +
+ "' and Dag operand count mismatch");
}
// Check that all names in the source DAG appear in the destination DAG.
@@ -463,7 +437,6 @@ void CompressInstEmitter::evaluateCompressPat(const Record *Rec) {
// Checking we are transforming from compressed to uncompressed instructions.
const Record *SourceOperator = SourceDag->getOperatorAsDef(Rec->getLoc());
CodeGenInstruction SourceInst(SourceOperator);
- verifyDagOpCount(SourceInst, SourceDag, true);
// Validate output Dag operands.
const DagInit *DestDag = Rec->getValueAsDag("Output");
@@ -472,7 +445,6 @@ void CompressInstEmitter::evaluateCompressPat(const Record *Rec) {
const Record *DestOperator = DestDag->getOperatorAsDef(Rec->getLoc());
CodeGenInstruction DestInst(DestOperator);
- verifyDagOpCount(DestInst, DestDag, false);
if (SourceOperator->getValueAsInt("Size") <=
DestOperator->getValueAsInt("Size"))
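verifyDagOpCount() is gone; the same operand-count invariant is now enforced inline while the operand map is built. The rule those inline checks encode, pulled out as a hypothetical pure function for illustration:

    // Every non-tied MI operand must pair with exactly one Dag argument;
    // source (uncompressed) instructions may carry at most one tied operand.
    static bool dagOperandCountMatches(unsigned NumMIOperands,
                                       unsigned NumTiedOperands,
                                       unsigned NumDagArgs) {
      return NumDagArgs == NumMIOperands - NumTiedOperands;
    }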
diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py
index 3754aa2..c2829c1 100644
--- a/llvm/utils/UpdateTestChecks/asm.py
+++ b/llvm/utils/UpdateTestChecks/asm.py
@@ -593,6 +593,7 @@ def get_run_handler(triple):
"riscv64": (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE),
"lanai": (scrub_asm_lanai, ASM_FUNCTION_LANAI_RE),
"sparc": (scrub_asm_sparc, ASM_FUNCTION_SPARC_RE),
+ "spirv": (scrub_asm_spirv, ASM_FUNCTION_SPIRV_RE),
"spirv32": (scrub_asm_spirv, ASM_FUNCTION_SPIRV_RE),
"spirv64": (scrub_asm_spirv, ASM_FUNCTION_SPIRV_RE),
"s390x": (scrub_asm_systemz, ASM_FUNCTION_SYSTEMZ_RE),
diff --git a/llvm/utils/gn/build/BUILD.gn b/llvm/utils/gn/build/BUILD.gn
index 9b5254e..f080a4c6 100644
--- a/llvm/utils/gn/build/BUILD.gn
+++ b/llvm/utils/gn/build/BUILD.gn
@@ -179,6 +179,7 @@ config("compiler_defaults") {
"_HAS_EXCEPTIONS=0",
"_UNICODE",
"UNICODE",
+ "CLANG_BUILD_STATIC",
]
cflags += [ "/EHs-c-" ]
cflags_cc += [ "/std:c++17" ]
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn
index ef804af..c7cccc4 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn
@@ -20,5 +20,6 @@ static_library("llvm") {
"PreferRegisterOverUnsignedCheck.cpp",
"PreferStaticOverAnonymousNamespaceCheck.cpp",
"TwineLocalCheck.cpp",
+ "UseNewMLIROpBuilderCheck.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn
index ab5dae8..ac2ce0c 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn
@@ -8,6 +8,7 @@ unittest("ClangAnalysisTests") {
"//clang/lib/Analysis",
"//clang/lib/Basic",
"//clang/lib/Frontend",
+ "//clang/lib/Testing",
"//clang/lib/Tooling",
"//llvm/lib/Support",
]
@@ -17,6 +18,7 @@ unittest("ClangAnalysisTests") {
"CloneDetectionTest.cpp",
"ExprMutationAnalyzerTest.cpp",
"IntervalPartitionTest.cpp",
+ "LifetimeSafetyTest.cpp",
"MacroExpansionContextTest.cpp",
"UnsafeBufferUsageTest.cpp",
]
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 5309b5d..1f83a7c 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1192,6 +1192,7 @@ if (current_toolchain == default_toolchain) {
"__locale_dir/time.h",
"__locale_dir/wbuffer_convert.h",
"__locale_dir/wstring_convert.h",
+ "__log_hardening_failure",
"__math/abs.h",
"__math/copysign.h",
"__math/error_functions.h",
@@ -1394,6 +1395,7 @@ if (current_toolchain == default_toolchain) {
"__ranges/transform_view.h",
"__ranges/view_interface.h",
"__ranges/views.h",
+ "__ranges/zip_transform_view.h",
"__ranges/zip_view.h",
"__split_buffer",
"__std_mbstate_t.h",
@@ -1437,7 +1439,6 @@ if (current_toolchain == default_toolchain) {
"__tuple/make_tuple_types.h",
"__tuple/sfinae_helpers.h",
"__tuple/tuple_element.h",
- "__tuple/tuple_indices.h",
"__tuple/tuple_like.h",
"__tuple/tuple_like_ext.h",
"__tuple/tuple_like_no_subrange.h",
diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
index b9e8d07..327a8ed 100644
--- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
@@ -317,7 +317,10 @@ if (libcxx_enable_experimental) {
static_library("cxx_experimental") {
output_dir = runtimes_dir
output_name = "c++experimental"
- sources = [ "experimental/keep.cpp" ]
+ sources = [
+ "experimental/keep.cpp",
+ "experimental/log_hardening_failure.cpp",
+ ]
if (libcxx_enable_filesystem && libcxx_enable_time_zone_database) {
sources += [
# TODO TZDB The exception could be moved in chrono once the TZDB library
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
index b6b8f2f..1612144 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
@@ -62,8 +62,13 @@ static_library("CPlusPlus") {
"LibStdcppUniquePointer.cpp",
"MSVCUndecoratedNameParser.cpp",
"MsvcStl.cpp",
+ "MsvcStlAtomic.cpp",
+ "MsvcStlDeque.cpp",
"MsvcStlSmartPointer.cpp",
+ "MsvcStlTree.cpp",
"MsvcStlTuple.cpp",
+ "MsvcStlUnordered.cpp",
+ "MsvcStlVariant.cpp",
"MsvcStlVector.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
index 6dcce2d..586f9fd 100644
--- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
@@ -118,6 +118,7 @@ write_lit_cfg("lit_shell_site_cfg") {
"CLANG_RESOURCE_DIR=",
"DEFAULT_SYSROOT=",
"LIBCXX_LIBRARY_DIR=" + rebase_path("$root_build_dir/lib"),
+ "LLDB_BUILD_LLDBRPC=0", # FIXME: add lldb-rpc-gen target, enable
"LLDB_ENABLE_LUA=0", # FIXME: gn arg, use in Config.h
"LLDB_ENABLE_LZMA=0", # FIXME: gn arg, use in Config.h
"LLDB_ENABLE_PYTHON=0", # FIXME: gn arg, use in Config.h
diff --git a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
index 2959d22..1a890f6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
@@ -17,6 +17,7 @@ static_library("BinaryFormat") {
"MsgPackDocumentYAML.cpp",
"MsgPackReader.cpp",
"MsgPackWriter.cpp",
+ "SFrame.cpp",
"Wasm.cpp",
"XCOFF.cpp",
]
diff --git a/llvm/utils/gn/secondary/llvm/lib/Object/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Object/BUILD.gn
index 883c648..7d55ac8 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Object/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Object/BUILD.gn
@@ -42,6 +42,7 @@ static_library("Object") {
"OffloadBundle.cpp",
"RecordStreamer.cpp",
"RelocationResolver.cpp",
+ "SFrameParser.cpp",
"SymbolSize.cpp",
"SymbolicFile.cpp",
"TapiFile.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
index d327e81..d84c8a6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -76,6 +76,7 @@ static_library("Utils") {
"MoveAutoInit.cpp",
"NameAnonGlobals.cpp",
"PredicateInfo.cpp",
+ "ProfileVerify.cpp",
"PromoteMemoryToRegister.cpp",
"RelLookupTableConverter.cpp",
"SCCPSolver.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
index 3aaec30..bcb8535 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
@@ -34,6 +34,7 @@ unittest("SupportTests") {
"DJBTest.cpp",
"DataExtractorTest.cpp",
"DebugCounterTest.cpp",
+ "DebugLogTest.cpp",
"DebugTest.cpp",
"DivisionByConstantTest.cpp",
"ELFAttributeParserTest.cpp",
diff --git a/llvm/utils/update_mir_regclass_numbers b/llvm/utils/update_mir_regclass_numbers
new file mode 100755
index 0000000..21a8ae2
--- /dev/null
+++ b/llvm/utils/update_mir_regclass_numbers
@@ -0,0 +1,27 @@
+#!/bin/sh
+set -e
+
+# Update operands like "1966090 /* regdef:VGPR_32 */" in MIR tests when register
+# class numbers change.
+
+if [ $# -eq 0 ] ; then
+ echo "usage: ${0##*/} /path/to/<Target>GenRegisterInfo.inc test/CodeGen/<Target>/testfile.mir..." >&2
+ exit 1
+fi
+
+reginfo="$1"
+shift
+
+files=$(grep -El ' [0-9]+ /\* [a-z-]+:\w+ \*/' "$@")
+[ "$files" ] || exit 0
+
+grep -Eho ' [0-9]+ /\* [a-z-]+:\w+ \*/' $files | sed -E 's/.*:(\w+).*/\1/' | sort -u | while read -r class ; do
+ id=$(grep -E "^ ${class}RegClassID = " "$reginfo" | sed -E 's/.* = ([0-9]+).*/\1/')
+ if [ "$id" ] ; then
+ echo "$class..."
+ sed -Ei -e 's| [0-9]+ (/\* reguse:'"$class"' \*/)| '"$(((id + 1) << 16 | 9))"' \1|g' \
+ -e 's| [0-9]+ (/\* regdef:'"$class"' \*/)| '"$(((id + 1) << 16 | 10))"' \1|g' \
+ -e 's| [0-9]+ (/\* regdef-ec:'"$class"' \*/)| '"$(((id + 1) << 16 | 11))"' \1|g' \
+ $files
+ fi
+done
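The operand values the script rewrites follow a fixed encoding, reconstructed here from the sed expressions above (an inference from the script, not from TableGen sources):

    // Flag: 9 = reguse, 10 = regdef, 11 = regdef-ec.
    constexpr unsigned encodeRegClassOperand(unsigned RegClassID, unsigned Flag) {
      return ((RegClassID + 1) << 16) | Flag;
    }
    // Matches the example in the script header:
    static_assert(encodeRegClassOperand(29, 10) == 1966090,
                  "1966090 /* regdef:VGPR_32 */ implies VGPR_32RegClassID == 29");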
diff --git a/llvm/utils/update_mir_test_checks.py b/llvm/utils/update_mir_test_checks.py
index 8db46ad..ca46e1e 100755
--- a/llvm/utils/update_mir_test_checks.py
+++ b/llvm/utils/update_mir_test_checks.py
@@ -35,9 +35,14 @@ from UpdateTestChecks import common
VREG_RE = re.compile(r"(%[0-9]+)(?:\.[a-z0-9_]+)?(?::[a-z0-9_]+)?(?:\([<>a-z0-9 ]+\))?")
MI_FLAGS_STR = (
r"(frame-setup |frame-destroy |nnan |ninf |nsz |arcp |contract |afn "
- r"|reassoc |nuw |nsw |exact |nofpexcept |nomerge |disjoint )*"
+ r"|reassoc |nuw |nsw |exact |nofpexcept |nomerge |unpredictable "
+ r"|noconvergent |nneg |disjoint |nusw |samesign )*"
)
VREG_DEF_FLAGS_STR = r"(?:dead |undef )*"
+
+# Pattern to match the defined vregs and the opcode of an instruction that
+# defines vregs. Opcodes starting with a lower-case 't' are allowed to match
+# ARM's Thumb instructions, like tADDi8 and t2ADDri.
VREG_DEF_RE = re.compile(
r"^ *(?P<vregs>{2}{0}(?:, {2}{0})*) = "
r"{1}(?P<opcode>[A-Zt][A-Za-z0-9_]+)".format(